Replace chars with textlines

* Add split function * Add split_text and shift_text params * Change get_rotation * Move get_column_index to utils * Add split_text and shift_text * Fix split_text
2016-10-12 13:17:02 +05:30 · 2016-10-12 13:17:02 +05:30 · a43d5ca2c7
parent 02ef332bd6
commit a43d5ca2c7
7 changed files with 590 additions and 428 deletions
--- a/camelot/lattice.py
+++ b/camelot/lattice.py
@ -8,10 +8,10 @@ import subprocess
 from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
                      find_table_joints)
 from .table import Table
-from .utils import (scale_to_pdf, scale_to_image, get_rotation, segments_bbox,
+from .utils import (scale_to_pdf, scale_to_image, get_rotation, rotate_segments,
-                    text_bbox, merge_close_values, get_row_index,
+                    rotate_textlines, rotate_table, segments_bbox, text_in_bbox,
-                    get_column_index, get_score, count_empty, encode_list,
+                    merge_close_values, get_table_index, get_score, count_empty,
-                    get_text_objects, get_page_layout)
+                    encode_list, get_text_objects, get_page_layout)
 __all__ = ['Lattice']
@ -25,6 +25,52 @@ def _reduce_method(m):
 copy_reg.pickle(types.MethodType, _reduce_method)
 def _reduce_index(t, idx, shift_text):
    """Reduces index of a text object if it lies within a spanning
    cell taking in account table rotation.
    Parameters
    ----------
    table : object
        camelot.table.Table
    idx : list
        List of tuples of the form (r_idx, c_idx, text).
    shift_text : list
        {'l', 'r', 't', 'b'}
        Select one or more from above and pass them as a list to
        specify where the text in a spanning cell should flow.
    Returns
    -------
    indices : list
        List of tuples of the form (idx, text) where idx is the reduced
        index of row/column and text is the an lttextline substring.
    """
    indices = []
    for r_idx, c_idx, text in idx:
        for d in shift_text:
            if d == 'l':
                if t.cells[r_idx][c_idx].spanning_h:
                    while not t.cells[r_idx][c_idx].left:
                        c_idx -= 1
            if d == 'r':
                if t.cells[r_idx][c_idx].spanning_h:
                    while not t.cells[r_idx][c_idx].right:
                        c_idx += 1
            if d == 't':
                if t.cells[r_idx][c_idx].spanning_v:
                    while not t.cells[r_idx][c_idx].top:
                        r_idx -= 1
            if d == 'b':
                if t.cells[r_idx][c_idx].spanning_v:
                    while not t.cells[r_idx][c_idx].bottom:
                        r_idx += 1
        indices.append((r_idx, c_idx, text))
    return indices
 def _fill_spanning(t, fill=None):
    """Fills spanning cells.
@ -67,78 +113,6 @@ def _fill_spanning(t, fill=None):
    return t
 def _outline(t):
    """Sets table border edges to True.
    Parameters
    ----------
    t : object
        camelot.table.Table
    Returns
    -------
    t : object
        camelot.table.Table
    """
    for i in range(len(t.cells)):
        t.cells[i][0].left = True
        t.cells[i][len(t.cells[i]) - 1].right = True
    for i in range(len(t.cells[0])):
        t.cells[0][i].top = True
        t.cells[len(t.cells) - 1][i].bottom = True
    return t
 def _reduce_index(t, rotation, r_idx, c_idx):
    """Reduces index of a text object if it lies within a spanning
    cell taking in account table rotation.
    Parameters
    ----------
    t : object
        camelot.table.Table
    rotation : string
        {'', 'left', 'right'}
    r_idx : int
        Current row index.
    c_idx : int
        Current column index.
    Returns
    -------
    r_idx : int
        Reduced row index.
    c_idx : int
        Reduced column index.
    """
    if not rotation:
        if t.cells[r_idx][c_idx].spanning_h:
            while not t.cells[r_idx][c_idx].left:
                c_idx -= 1
        if t.cells[r_idx][c_idx].spanning_v:
            while not t.cells[r_idx][c_idx].top:
                r_idx -= 1
    elif rotation == 'left':
        if t.cells[r_idx][c_idx].spanning_h:
            while not t.cells[r_idx][c_idx].left:
                c_idx -= 1
        if t.cells[r_idx][c_idx].spanning_v:
            while not t.cells[r_idx][c_idx].bottom:
                r_idx += 1
    elif rotation == 'right':
        if t.cells[r_idx][c_idx].spanning_h:
            while not t.cells[r_idx][c_idx].right:
                c_idx += 1
        if t.cells[r_idx][c_idx].spanning_v:
            while not t.cells[r_idx][c_idx].top:
                r_idx -= 1
    return r_idx, c_idx
 class Lattice:
    """Lattice looks for lines in the pdf to form a table.
@ -179,6 +153,17 @@ class Lattice:
        PDFMiner margins. (char_margin, line_margin, word_margin)
        (optional, default: (1.0, 0.5, 0.1))
    split_text : bool
        Whether or not to split a text line if it spans across
        different cells.
        (optional, default: False)
    shift_text : list
        {'l', 'r', 't', 'b'}
        Select one or more from above and pass them as a list to
        specify where the text in a spanning cell should flow.
        (optional, default: ['l', 't'])
    debug : string
        {'contour', 'line', 'joint', 'table'}
        Set to one of the above values to generate a matplotlib plot
@ -186,7 +171,8 @@ class Lattice:
        (optional, default: None)
    """
    def __init__(self, table_area=None, fill=None, mtol=[2], scale=15,
-                 invert=False, margins=(1.0, 0.5, 0.1), debug=None):
+                 invert=False, margins=(1.0, 0.5, 0.1), split_text=False,
                 shift_text=['l', 't'], debug=None):
        self.method = 'lattice'
        self.table_area = table_area
@ -195,6 +181,8 @@ class Lattice:
        self.scale = scale
        self.invert = invert
        self.char_margin, self.line_margin, self.word_margin = margins
        self.split_text = split_text
        self.shift_text = shift_text
        self.debug = debug
    def get_tables(self, pdfname):
@ -211,9 +199,9 @@ class Lattice:
        """
        layout, dim = get_page_layout(pdfname, char_margin=self.char_margin,
            line_margin=self.line_margin, word_margin=self.word_margin)
-        ltchar = get_text_objects(layout, LTType="char")
+        lttextlh = get_text_objects(layout, ltype="lh")
-        lttextlh = get_text_objects(layout, LTType="lh")
+        lttextlv = get_text_objects(layout, ltype="lv")
-        lttextlv = get_text_objects(layout, LTType="lv")
+        ltchar = get_text_objects(layout, ltype="char")
        width, height = dim
        bname, __ = os.path.splitext(pdfname)
        if not ltchar:
@ -287,11 +275,15 @@ class Lattice:
            # select elements which lie within table_bbox
            table_data = {}
            v_s, h_s = segments_bbox(k, v_segments, h_segments)
-            char_bbox = text_bbox(k, ltchar)
+            lh_bbox = text_in_bbox(k, lttextlh)
-            lh_bbox = text_bbox(k, lttextlh)
+            lv_bbox = text_in_bbox(k, lttextlv)
-            lv_bbox = text_bbox(k, lttextlv)
+            char_bbox = text_in_bbox(k, ltchar)
            table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
-            table_rotation = get_rotation(char_bbox, lh_bbox, lv_bbox)
+            table_rotation = get_rotation(lh_bbox, lv_bbox, char_bbox)
            v_s, h_s = rotate_segments(v_s, h_s, table_rotation)
            t_bbox = rotate_textlines(lh_bbox, lv_bbox, table_rotation)
            for direction in t_bbox:
                t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
            cols, rows = zip(*table_bbox[k])
            cols, rows = list(cols), list(rows)
            cols.extend([k[0], k[2]])
@ -305,6 +297,7 @@ class Lattice:
                    for i in range(0, len(cols) - 1)]
            rows = [(rows[i], rows[i + 1])
                    for i in range(0, len(rows) - 1)]
            rows, cols = rotate_table(rows, cols, table_rotation)
            table = Table(cols, rows)
            # set table edges to True using ver+hor lines
            table = table.set_edges(v_s, h_s)
@ -313,58 +306,26 @@ class Lattice:
            # set spanning cells to True
            table = table.set_spanning()
            # set table border edges to True
-            table = _outline(table)
+            table = table.set_border_edges()
            if self.debug:
                self.debug_tables.append(table)
-            rerror = []
+            assignment_errors = []
-            cerror = []
+            for direction in t_bbox:
-            for t in char_bbox:
+                for t in t_bbox[direction]:
-                try:
+                    indices, error = get_table_index(
-                    r_idx, rass_error = get_row_index(t, rows)
+                        table, t, direction, split_text=self.split_text)
-                except TypeError:
+                    assignment_errors.append(error)
-                    # couldn't assign LTChar to any cell
+                    indices = _reduce_index(table, indices, shift_text=self.shift_text)
-                    continue
+                    for r_idx, c_idx, text in indices:
-                try:
+                        table.cells[r_idx][c_idx].add_text(text)
-                    c_idx, cass_error = get_column_index(t, cols)
+            score = get_score([[100, assignment_errors]])
                except TypeError:
                    # couldn't assign LTChar to any cell
                    continue
                rerror.append(rass_error)
                cerror.append(cass_error)
                r_idx, c_idx = _reduce_index(table, table_rotation, r_idx, c_idx)
                table.cells[r_idx][c_idx].add_object(t)
            for i in range(len(table.cells)):
                for j in range(len(table.cells[i])):
                    t_bbox = table.cells[i][j].get_objects()
                    try:
                        cell_rotation = get_rotation(t_bbox)
                    except ZeroDivisionError:
                        cell_rotation = ''
                        pass
                    # fill text after sorting it
                    if cell_rotation == '':
                        t_bbox.sort(key=lambda x: (-x.y0, x.x0))
                    elif cell_rotation == 'left':
                        t_bbox.sort(key=lambda x: (x.x0, x.y0))
                    elif cell_rotation == 'right':
                        t_bbox.sort(key=lambda x: (-x.x0, -x.y0))
                    table.cells[i][j].add_text(''.join([t.get_text()
                        for t in t_bbox]))
            score = get_score([[50, rerror], [50, cerror]])
            table_data['score'] = score
            if self.fill is not None:
                table = _fill_spanning(table, fill=self.fill[table_no])
            ar = table.get_list()
            if table_rotation == 'left':
                ar = zip(*ar[::-1])
            elif table_rotation == 'right':
                ar = zip(*ar[::1])
                ar.reverse()
            ar = encode_list(ar)
            table_data['data'] = ar
            empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
--- a/camelot/pdf.py
+++ b/camelot/pdf.py
@ -196,28 +196,28 @@ class Pdf:
            try:
                for tables in self.debug_tables:
                    for table in tables:
-                        for i in range(len(table.cells)):
+                        for r in range(len(table.rows)):
-                            for j in range(len(table.cells[i])):
+                            for c in range(len(table.cols)):
-                                if table.cells[i][j].left:
+                                if table.cells[r][c].left:
-                                    plt.plot([table.cells[i][j].lb[0],
+                                    plt.plot([table.cells[r][c].lb[0],
-                                              table.cells[i][j].lt[0]],
+                                              table.cells[r][c].lt[0]],
-                                             [table.cells[i][j].lb[1],
+                                             [table.cells[r][c].lb[1],
-                                              table.cells[i][j].lt[1]])
+                                              table.cells[r][c].lt[1]])
-                                if table.cells[i][j].right:
+                                if table.cells[r][c].right:
-                                    plt.plot([table.cells[i][j].rb[0],
+                                    plt.plot([table.cells[r][c].rb[0],
-                                              table.cells[i][j].rt[0]],
+                                              table.cells[r][c].rt[0]],
-                                             [table.cells[i][j].rb[1],
+                                             [table.cells[r][c].rb[1],
-                                              table.cells[i][j].rt[1]])
+                                              table.cells[r][c].rt[1]])
-                                if table.cells[i][j].top:
+                                if table.cells[r][c].top:
-                                    plt.plot([table.cells[i][j].lt[0],
+                                    plt.plot([table.cells[r][c].lt[0],
-                                              table.cells[i][j].rt[0]],
+                                              table.cells[r][c].rt[0]],
-                                             [table.cells[i][j].lt[1],
+                                             [table.cells[r][c].lt[1],
-                                              table.cells[i][j].rt[1]])
+                                              table.cells[r][c].rt[1]])
-                                if table.cells[i][j].bottom:
+                                if table.cells[r][c].bottom:
-                                    plt.plot([table.cells[i][j].lb[0],
+                                    plt.plot([table.cells[r][c].lb[0],
-                                              table.cells[i][j].rb[0]],
+                                              table.cells[r][c].rb[0]],
-                                             [table.cells[i][j].lb[1],
+                                             [table.cells[r][c].lb[1],
-                                              table.cells[i][j].rb[1]])
+                                              table.cells[r][c].rb[1]])
                    plt.show()
            except AttributeError:
                raise ValueError("This option only be used with Lattice.")
--- a/camelot/stream.py
+++ b/camelot/stream.py
@ -7,8 +7,9 @@ import copy_reg
 import numpy as np
 from .table import Table
-from .utils import (rotate, get_rotation, text_bbox, get_row_index, get_score,
+from .utils import (rotate, get_rotation, rotate_textlines, text_in_bbox,
-                    count_empty, encode_list, get_text_objects, get_page_layout)
+                    get_table_index, get_score, count_empty, encode_list,
                    get_text_objects, get_page_layout)
 __all__ = ['Stream']
@ -22,6 +23,29 @@ def _reduce_method(m):
 copy_reg.pickle(types.MethodType, _reduce_method)
 def _text_bbox(t_bbox):
    """Returns bounding box for the text present on a page.
    Parameters
    ----------
    t_bbox : dict
        Dict with two keys 'horizontal' and 'vertical' with lists of
        LTTextLineHorizontals and LTTextLineVerticals respectively.
    Returns
    -------
    text_bbox : tuple
        Tuple of the form (x0, y0, x1, y1) in PDFMiner's coordinate
        space.
    """
    xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
    ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
    xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
    ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]])
    text_bbox = (xmin, ymin, xmax, ymax)
    return text_bbox
 def _group_rows(text, ytol=2):
    """Groups PDFMiner text objects into rows using their
    y-coordinates taking into account some tolerance ytol.
@ -185,45 +209,6 @@ def _add_columns(cols, text, ytol):
    return cols
 def _get_column_index(t, columns):
    """Gets index of the column in which the given text object lies by
    comparing their x-coordinates.
    Parameters
    ----------
    t : object
    columns : list
        List of column coordinate tuples.
    Returns
    -------
    c_idx : int
    error : float
    """
    offset1, offset2 = 0, 0
    lt_col_overlap = []
    for c in columns:
        if c[0] <= t.x1 and c[1] >= t.x0:
            left = t.x0 if c[0] <= t.x0 else c[0]
            right = t.x1 if c[1] >= t.x1 else c[1]
            lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1]))
        else:
            lt_col_overlap.append(-1)
    if len(filter(lambda x: x != -1, lt_col_overlap)) == 0:
        logging.warning("Text doesn't fit any column.")
    c_idx = lt_col_overlap.index(max(lt_col_overlap))
    if t.x0 < columns[c_idx][0]:
        offset1 = abs(t.x0 - columns[c_idx][0])
    if t.x1 > columns[c_idx][1]:
        offset2 = abs(t.x1 - columns[c_idx][1])
    Y = abs(t.y0 - t.y1)
    charea = abs(t.x0 - t.x1) * abs(t.y0 - t.y1)
    error = (Y * (offset1 + offset2)) / charea
    return c_idx, error
 class Stream:
    """Stream looks for spaces between text elements to form a table.
@ -265,13 +250,19 @@ class Stream:
        PDFMiner margins. (char_margin, line_margin, word_margin)
        (optional, default: (1.0, 0.5, 0.1))
    split_text : bool
        Whether or not to split a text line if it spans across
        different cells.
        (optional, default: False)
    debug : bool
        Set to True to generate a matplotlib plot of
        LTTextLineHorizontals in order to select table_area, columns.
        (optional, default: False)
    """
    def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2],
-                 mtol=[0], margins=(1.0, 0.5, 0.1), debug=False):
+                 mtol=[0], margins=(1.0, 0.5, 0.1), split_text=False,
                 debug=False):
        self.method = 'stream'
        self.table_area = table_area
@ -280,6 +271,7 @@ class Stream:
        self.ytol = ytol
        self.mtol = mtol
        self.char_margin, self.line_margin, self.word_margin = margins
        self.split_text = split_text
        self.debug = debug
    def get_tables(self, pdfname):
@ -296,9 +288,9 @@ class Stream:
        """
        layout, dim = get_page_layout(pdfname, char_margin=self.char_margin,
            line_margin=self.line_margin, word_margin=self.word_margin)
-        ltchar = get_text_objects(layout, LTType="char")
+        lttextlh = get_text_objects(layout, ltype="lh")
-        lttextlh = get_text_objects(layout, LTType="lh")
+        lttextlv = get_text_objects(layout, ltype="lv")
-        lttextlv = get_text_objects(layout, LTType="lv")
+        ltchar = get_text_objects(layout, ltype="char")
        width, height = dim
        bname, __ = os.path.splitext(pdfname)
        if not lttextlh:
@ -308,6 +300,8 @@ class Stream:
        if self.debug:
            self.debug_text = []
            self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
            self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
        if self.table_area is not None:
            if self.columns is not None:
@ -339,34 +333,16 @@ class Stream:
        for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
            # select elements which lie within table_bbox
            table_data = {}
-            table_rotation = get_rotation(ltchar, lttextlh, lttextlv)
+            lh_bbox = text_in_bbox(k, lttextlh)
-            if table_rotation != '':
+            lv_bbox = text_in_bbox(k, lttextlv)
-                t_bbox = text_bbox(k, lttextlv)
+            char_bbox = text_in_bbox(k, ltchar)
-                if table_rotation == 'left':
+            table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
-                    if self.debug:
+            table_rotation = get_rotation(lh_bbox, lv_bbox, char_bbox)
-                        self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
+            t_bbox = rotate_textlines(lh_bbox, lv_bbox, table_rotation)
-                    for t in t_bbox:
+            for direction in t_bbox:
-                        x0, y0, x1, y1 = t.bbox
+                t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
-                        x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2)
+            text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox)
-                        x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2)
+            rows_grouped = _group_rows(t_bbox['horizontal'], ytol=self.ytol[table_no])
                        t.set_bbox((x0, y1, x1, y0))
                elif table_rotation == 'right':
                    for t in t_bbox:
                        x0, y0, x1, y1 = t.bbox
                        x0, y0 = rotate(0, 0, x0, y0, np.pi / 2)
                        x1, y1 = rotate(0, 0, x1, y1, np.pi / 2)
                        t.set_bbox((x1, y0, x0, y1))
            else:
                if self.debug:
                    self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
                t_bbox = text_bbox(k, lttextlh)
            t_bbox.sort(key=lambda x: (-x.y0, x.x0))
            text_x_min = min([t.x0 for t in t_bbox])
            text_y_min = min([t.y0 for t in t_bbox])
            text_x_max = max([t.x1 for t in t_bbox])
            text_y_max = max([t.y1 for t in t_bbox])
            rows_grouped = _group_rows(t_bbox, ytol=self.ytol[table_no])
            rows = _join_rows(rows_grouped, text_y_max, text_y_min)
            elements = [len(r) for r in rows_grouped]
@ -402,9 +378,9 @@ class Stream:
                    len_non_mode = len(filter(lambda x: x != ncols, elements))
                    if ncols == 1 and not self.debug:
                        # no tables detected
-                        logging.warning("{}: Only one column was detected, the PDF"
+                        logging.warning("{}: Only one column was detected, the pdf"
                                      " may have no tables. Specify ncols if"
-                                      " the PDF has tables.".format(
+                                      " the pdf has tables.".format(
                                      os.path.basename(bname)))
                    cols = [(t.x0, t.x1)
                        for r in rows_grouped if len(r) == ncols for t in r]
@ -413,35 +389,30 @@ class Stream:
                    for i in range(1, len(cols)):
                        left = cols[i - 1][1]
                        right = cols[i][0]
-                        inner_text.extend([t for t in t_bbox if t.x0 > left and t.x1 < right])
+                        inner_text.extend([t for direction in t_bbox
-                    outer_text = [t for t in t_bbox if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
+                                           for t in t_bbox[direction]
                                           if t.x0 > left and t.x1 < right])
                    outer_text = [t for direction in t_bbox
                                  for t in t_bbox[direction]
                                  if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
                    inner_text.extend(outer_text)
                    cols = _add_columns(cols, inner_text, self.ytol[table_no])
                    cols = _join_columns(cols, text_x_min, text_x_max)
            table = Table(cols, rows)
-            rerror = []
+            table = table.set_all_edges()
-            cerror = []
+            assignment_errors = []
-            for row in rows_grouped:
+            for direction in t_bbox:
-                for t in row:
+                for t in t_bbox[direction]:
-                    try:
+                    indices, error = get_table_index(
-                        r_idx, rass_error = get_row_index(t, rows)
+                        table, t, direction, split_text=self.split_text)
-                    except ValueError as e:
+                    assignment_errors.append(error)
-                        # couldn't assign LTTextLH to any cell
+                    for r_idx, c_idx, text in indices:
-                        continue
+                        table.cells[r_idx][c_idx].add_text(text)
                    try:
                        c_idx, cass_error = _get_column_index(t, cols)
                    except ValueError as e:
                        # couldn't assign LTTextLH to any cell
                        continue
                    rerror.append(rass_error)
                    cerror.append(cass_error)
                    table.cells[r_idx][c_idx].add_text(
                        t.get_text().strip('\n'))
            if guess:
-                score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
+                score = get_score([[66, assignment_errors], [34, [len_non_mode / len(elements)]]])
            else:
-                score = get_score([[50, rerror], [50, cerror]])
+                score = get_score([[100, assignment_errors]])
            table_data['score'] = score
            ar = table.get_list()
--- a/camelot/table.py
+++ b/camelot/table.py
@ -35,6 +35,28 @@ class Table:
                       for c in cols] for r in rows]
        self.nocont_ = 0
    def set_all_edges(self):
        """Marks the left, right, top and bottom edge of every cell
        in the table as True and returns the table itself.
        """
        n_rows, n_cols = len(self.rows), len(self.cols)
        for r_idx in range(n_rows):
            for c_idx in range(n_cols):
                cell = self.cells[r_idx][c_idx]
                cell.left = cell.right = cell.top = cell.bottom = True
        return self
    def set_border_edges(self):
        """Marks the edges on the outer border of the table as True
        and returns the table itself.
        """
        last_row = len(self.rows) - 1
        last_col = len(self.cols) - 1
        # left and right border, one row at a time
        for r_idx in range(last_row + 1):
            self.cells[r_idx][0].left = True
            self.cells[r_idx][last_col].right = True
        # top and bottom border, one column at a time
        for c_idx in range(last_col + 1):
            self.cells[0][c_idx].top = True
            self.cells[last_row][c_idx].bottom = True
        return self
    def set_edges(self, vertical, horizontal, jtol=2):
        """Sets a cell's edges to True depending on whether they
        overlap with lines found by imgproc.
@ -160,47 +182,47 @@ class Table:
        depending on whether the cell spans/extends horizontally or
        vertically.
        """
-        for i in range(len(self.cells)):
+        for r in range(len(self.rows)):
-            for j in range(len(self.cells[i])):
+            for c in range(len(self.cols)):
-                bound = self.cells[i][j].get_bounded_edges()
+                bound = self.cells[r][c].get_bounded_edges()
                if bound == 4:
                    continue
                elif bound == 3:
-                    if not self.cells[i][j].left:
+                    if not self.cells[r][c].left:
-                        if (self.cells[i][j].right and
+                        if (self.cells[r][c].right and
-                                self.cells[i][j].top and
+                                self.cells[r][c].top and
-                                self.cells[i][j].bottom):
+                                self.cells[r][c].bottom):
-                            self.cells[i][j].spanning_h = True
+                            self.cells[r][c].spanning_h = True
-                    elif not self.cells[i][j].right:
+                    elif not self.cells[r][c].right:
-                        if (self.cells[i][j].left and
+                        if (self.cells[r][c].left and
-                                self.cells[i][j].top and
+                                self.cells[r][c].top and
-                                self.cells[i][j].bottom):
+                                self.cells[r][c].bottom):
-                            self.cells[i][j].spanning_h = True
+                            self.cells[r][c].spanning_h = True
-                    elif not self.cells[i][j].top:
+                    elif not self.cells[r][c].top:
-                        if (self.cells[i][j].left and
+                        if (self.cells[r][c].left and
-                                self.cells[i][j].right and
+                                self.cells[r][c].right and
-                                self.cells[i][j].bottom):
+                                self.cells[r][c].bottom):
-                            self.cells[i][j].spanning_v = True
+                            self.cells[r][c].spanning_v = True
-                    elif not self.cells[i][j].bottom:
+                    elif not self.cells[r][c].bottom:
-                        if (self.cells[i][j].left and
+                        if (self.cells[r][c].left and
-                                self.cells[i][j].right and
+                                self.cells[r][c].right and
-                                self.cells[i][j].top):
+                                self.cells[r][c].top):
-                            self.cells[i][j].spanning_v = True
+                            self.cells[r][c].spanning_v = True
                elif bound == 2:
-                    if self.cells[i][j].left and self.cells[i][j].right:
+                    if self.cells[r][c].left and self.cells[r][c].right:
-                        if (not self.cells[i][j].top and
+                        if (not self.cells[r][c].top and
-                                not self.cells[i][j].bottom):
+                                not self.cells[r][c].bottom):
-                            self.cells[i][j].spanning_v = True
+                            self.cells[r][c].spanning_v = True
-                    elif self.cells[i][j].top and self.cells[i][j].bottom:
+                    elif self.cells[r][c].top and self.cells[r][c].bottom:
-                        if (not self.cells[i][j].left and
+                        if (not self.cells[r][c].left and
-                                not self.cells[i][j].right):
+                                not self.cells[r][c].right):
-                            self.cells[i][j].spanning_h = True
+                            self.cells[r][c].spanning_h = True
        return self
@ -213,7 +235,7 @@ class Table:
        ar : list
        """
        ar = []
-        for i in range(len(self.cells)):
+        for r in range(len(self.rows)):
-            ar.append([self.cells[i][j].get_text().strip()
+            ar.append([self.cells[r][c].get_text().strip()
-                       for j in range(len(self.cells[i]))])
+                       for c in range(len(self.cols))])
        return ar
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -1,5 +1,6 @@
 from __future__ import division
 import os
 import logging
 import numpy as np
@ -11,7 +12,8 @@ from pdfminer.pdfinterp import PDFResourceManager
 from pdfminer.pdfinterp import PDFPageInterpreter
 from pdfminer.pdfdevice import PDFDevice
 from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal, LTTextLineVertical
+from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
                             LTTextLineVertical)
 def translate(x1, x2):
@ -174,22 +176,20 @@ def scale_to_pdf(tables, v_segments, h_segments, factors):
    return tables_new, v_segments_new, h_segments_new
-def get_rotation(ltchar, lttextlh=None, lttextlv=None):
+def get_rotation(lttextlh, lttextlv, ltchar):
    """Detects if text in table is vertical or not using the current
    transformation matrix (CTM) and returns its orientation.
    Parameters
    ----------
    ltchar : list
        List of PDFMiner LTChar objects.
    lttextlh : list
        List of PDFMiner LTTextLineHorizontal objects.
        (optional, default: None)
    lttextlv : list
        List of PDFMiner LTTextLineVertical objects.
-        (optional, default: None)
+
    ltchar : list
        List of PDFMiner LTChar objects.
    Returns
    -------
@ -199,15 +199,9 @@ def get_rotation(ltchar, lttextlh=None, lttextlv=None):
        anti-clockwise and 'right' if rotated 90 degree clockwise.
    """
    rotation = ''
-    if lttextlh is not None and lttextlv is not None:
+    hlen = len([t for t in lttextlh if t.get_text().strip()])
-        hlen = len([t for t in lttextlh if t.get_text().strip()])
+    vlen = len([t for t in lttextlv if t.get_text().strip()])
-        vlen = len([t for t in lttextlv if t.get_text().strip()])
+    if hlen < vlen:
        vger = 0.0
    else:
        hlen = len([t for t in ltchar if t.upright and t.get_text().strip()])
        vlen = len([t for t in ltchar if (not t.upright) and t.get_text().strip()])
        vger = vlen / float(hlen+vlen)
    if hlen < vlen or vger > 0.8:
        clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
        anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
        rotation = 'left' if clockwise < anticlockwise else 'right'
@ -247,7 +241,183 @@ def segments_bbox(bbox, v_segments, h_segments):
    return v_s, h_s
-def text_bbox(bbox, text):
+def rotate_segments(v_s, h_s, table_rotation):
    """Rotates line segments if the table is rotated.
    Parameters
    ----------
    v_s : list
        List of vertical line segments.
    h_s : list
        List of horizontal line segments.
    table_rotation : string
        {'', 'left', 'right'}
    Returns
    -------
    vertical : list
        List of rotated vertical line segments.
    horizontal : list
        List of rotated horizontal line segments.
    """
    vertical, horizontal = [], []
    if table_rotation != '':
        if table_rotation == 'left':
            for v in v_s:
                x0, y0 = rotate(0, 0, v[0], v[1], -np.pi / 2)
                x1, y1 = rotate(0, 0, v[2], v[3], -np.pi / 2)
                horizontal.append((x0, y0, x1, y1))
            for h in h_s:
                x0, y0 = rotate(0, 0, h[0], h[1], -np.pi / 2)
                x1, y1 = rotate(0, 0, h[2], h[3], -np.pi / 2)
                vertical.append((x1, y1, x0, y0))
        elif table_rotation == 'right':
            for v in v_s:
                x0, y0 = rotate(0, 0, v[0], v[1], np.pi / 2)
                x1, y1 = rotate(0, 0, v[2], v[3], np.pi / 2)
                horizontal.append((x1, y1, x0, y0))
            for h in h_s:
                x0, y0 = rotate(0, 0, h[0], h[1], np.pi / 2)
                x1, y1 = rotate(0, 0, h[2], h[3], np.pi / 2)
                vertical.append((x0, y0, x1, y1))
    else:
        vertical = v_s
        horizontal = h_s
    return vertical, horizontal
 def rotate_textlines(lh_bbox, lv_bbox, table_rotation):
    """Rotates bounding boxes of LTTextLineHorizontals and
    LTTextLineVerticals if the table is rotated.
    Parameters
    ----------
    lh_bbox : list
        List of PDFMiner LTTextLineHorizontal objects.
    lv_bbox : list
        List of PDFMiner LTTextLineVertical objects.
    table_rotation : string
        {'', 'left', 'right'}
    Returns
    -------
    t_bbox : dict
        Dict with two keys 'horizontal' and 'vertical' holding lh_bbox
        and lv_bbox respectively, or swapped when the table is rotated.
    """
    t_bbox = {}
    if table_rotation != '':
        if table_rotation == 'left':
            for t in lh_bbox:
                x0, y0, x1, y1 = t.bbox
                x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2)
                x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2)
                t.set_bbox((x1, y0, x0, y1))
                for obj in t._objs:
                    if isinstance(obj, LTChar):
                        x0, y0, x1, y1 = obj.bbox
                        x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2)
                        x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2)
                        obj.set_bbox((x1, y0, x0, y1))
            for t in lv_bbox:
                x0, y0, x1, y1 = t.bbox
                x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2)
                x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2)
                t.set_bbox((x0, y1, x1, y0))
                for obj in t._objs:
                    if isinstance(obj, LTChar):
                        x0, y0, x1, y1 = obj.bbox
                        x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2)
                        x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2)
                        obj.set_bbox((x0, y1, x1, y0))
        elif table_rotation == 'right':
            for t in lh_bbox:
                x0, y0, x1, y1 = t.bbox
                x0, y0 = rotate(0, 0, x0, y0, np.pi / 2)
                x1, y1 = rotate(0, 0, x1, y1, np.pi / 2)
                t.set_bbox((x0, y1, x1, y0))
                for obj in t._objs:
                    if isinstance(obj, LTChar):
                        x0, y0, x1, y1 = obj.bbox
                        x0, y0 = rotate(0, 0, x0, y0, np.pi / 2)
                        x1, y1 = rotate(0, 0, x1, y1, np.pi / 2)
                        obj.set_bbox((x0, y1, x1, y0))
            for t in lv_bbox:
                x0, y0, x1, y1 = t.bbox
                x0, y0 = rotate(0, 0, x0, y0, np.pi / 2)
                x1, y1 = rotate(0, 0, x1, y1, np.pi / 2)
                t.set_bbox((x1, y0, x0, y1))
                for obj in t._objs:
                    if isinstance(obj, LTChar):
                        x0, y0, x1, y1 = obj.bbox
                        x0, y0 = rotate(0, 0, x0, y0, np.pi / 2)
                        x1, y1 = rotate(0, 0, x1, y1, np.pi / 2)
                        obj.set_bbox((x1, y0, x0, y1))
        t_bbox['horizontal'] = lv_bbox
        t_bbox['vertical'] = lh_bbox
    else:
        t_bbox['horizontal'] = lh_bbox
        t_bbox['vertical'] = lv_bbox
    return t_bbox
 def rotate_table(R, C, table_rotation):
    """Rotates coordinates of table rows and columns.
    Parameters
    ----------
    R : list
        List of row y-coordinate tuples.
    C : list
        List of column x-coordinate tuples.
    table_rotation : string
        {'', 'left', 'right'}
    Returns
    -------
    rows : list
        List of row y-coordinate tuples after rotation.
    cols : list
        List of column x-coordinate tuples after rotation.
    """
    rows, cols = [], []
    if table_rotation != '':
        if table_rotation == 'left':
            for r in R:
                r0, r1 = rotate(0, 0, 0, r[0], -np.pi / 2)
                r2, r3 = rotate(0, 0, 0, r[1], -np.pi / 2)
                cols.append((r2, r0))
            cols = sorted(cols)
            for c in C:
                c0, c1 = rotate(0, 0, c[0], 0, -np.pi / 2)
                c2, c3 = rotate(0, 0, c[1], 0, -np.pi / 2)
                rows.append((c1, c3))
        elif table_rotation == 'right':
            for r in R:
                r0, r1 = rotate(0, 0, 0, r[0], np.pi / 2)
                r2, r3 = rotate(0, 0, 0, r[1], np.pi / 2)
                cols.append((r0, r2))
            for c in C:
                c0, c1 = rotate(0, 0, c[0], 0, np.pi / 2)
                c2, c3 = rotate(0, 0, c[1], 0, np.pi / 2)
                rows.append((c3, c1))
            rows = sorted(rows, reverse=True)
    else:
        rows = R
        cols = C
    return rows, cols
 def text_in_bbox(bbox, text):
    """Returns all text objects present inside a
    table's bounding box.
@ -330,66 +500,141 @@ def merge_close_values(ar, mtol=2):
    return ret
-def get_row_index(t, rows):
+def split_textline(table, textline, direction):
-    """Gets index of the row in which the given text object lies by
+    """Splits PDFMiner LTTextLine into substrings if it spans across
-    comparing their y-coordinates.
+    multiple rows/columns.
    Parameters
    ----------
-    t : object
+    table : object
        camelot.table.Table
-    rows : list
+    textline : object
-        List of row coordinate tuples, sorted in decreasing order.
+        PDFMiner LTTextLine object.
    direction : string
        {'horizontal', 'vertical'}
        Direction of the PDFMiner LTTextLine object.
    Returns
    -------
-    r : int
+    cut_text : list
-
+        List of tuples of the form (r_idx, c_idx, text) where r_idx and
-    error : float
+        c_idx are row/column indices and text is an lttextline substring.
    """
-    offset1, offset2 = 0, 0
+    idx = 0
-    for r in range(len(rows)):
+    cut_text = []
-        if (t.y0 + t.y1) / 2.0 < rows[r][0] and (t.y0 + t.y1) / 2.0 > rows[r][1]:
+    bbox = textline.bbox
-            if t.y0 > rows[r][0]:
+    if direction == 'horizontal' and not textline.is_empty():
-                offset1 = abs(t.y0 - rows[r][0])
+        x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]]
-            if t.y1 < rows[r][1]:
+        r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]]
-                offset2 = abs(t.y1 - rows[r][1])
+        r = r_idx[0]
-            X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
+        x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right]
-            Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
+        if not x_cuts:
-            charea = X * Y
+            x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
-            error = (X * (offset1 + offset2)) / charea
+        for obj in textline._objs:
-            return r, error
+            row = table.rows[r]
            for cut in x_cuts:
                if isinstance(obj, LTChar):
                    if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and
                            (obj.x0 + obj.x1) / 2 <= cut[1]):
                        cut_text.append((r, cut[0], obj.get_text().strip('\n')))
                        break
                elif isinstance(obj, LTAnno):
                    cut_text.append((r, cut[0], obj.get_text().strip('\n')))
    elif direction == 'vertical' and not textline.is_empty():
        y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]]
        c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]]
        c = c_idx[0]
        y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom]
        if not y_cuts:
            y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
        for obj in textline._objs:
            col = table.cols[c]
            for cut in y_cuts:
                if isinstance(obj, LTChar):
                    if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and
                            (obj.y0 + obj.y1) / 2 >= cut[1]):
                        cut_text.append((cut[0], c, obj.get_text()))
                        break
                elif isinstance(obj, LTAnno):
                    cut_text.append((cut[0], c, obj.get_text().strip('\n')))
    return cut_text
-def get_column_index(t, columns):
+def get_table_index(table, t, direction, split_text=False):
-    """Gets index of the column in which the given text object lies by
+    """Gets indices of the cell where given text object lies by
-    comparing their x-coordinates.
+    comparing their y and x-coordinates.
    Parameters
    ----------
-    t : object
+    table : object
        camelot.table.Table
-    columns : list
+    t : object
-        List of column coordinate tuples.
+        PDFMiner LTTextLine object.
    direction : string
        {'horizontal', 'vertical'}
        Direction of the PDFMiner LTTextLine object.
    split_text : bool
        Whether or not to split a text line if it spans across
        multiple cells.
        (optional, default: False)
    Returns
    -------
-    c : int
+    indices : list
        List of tuples of the form (r_idx, c_idx, text) where r_idx and
        c_idx are row/column indices and text is an lttextline substring.
    error : float
        Assignment error, percentage of text area that lies outside
        a cell.
        +-------+
        |       |
        |   [Text bounding box]
        |       |
        +-------+
    """
-    offset1, offset2 = 0, 0
+    r_idx, c_idx = [-1] * 2
-    for c in range(len(columns)):
+    for r in range(len(table.rows)):
-        if (t.x0 + t.x1) / 2.0 > columns[c][0] and (t.x0 + t.x1) / 2.0 < columns[c][1]:
+        if ((t.y0 + t.y1) / 2.0 < table.rows[r][0] and
-            if t.x0 < columns[c][0]:
+                (t.y0 + t.y1) / 2.0 > table.rows[r][1]):
-                offset1 = abs(t.x0 - columns[c][0])
+            lt_col_overlap = []
-            if t.x1 > columns[c][1]:
+            for c in table.cols:
-                offset2 = abs(t.x1 - columns[c][1])
+                if c[0] <= t.x1 and c[1] >= t.x0:
-            X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
+                    left = t.x0 if c[0] <= t.x0 else c[0]
-            Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
+                    right = t.x1 if c[1] >= t.x1 else c[1]
-            charea = X * Y
+                    lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1]))
-            error = (Y * (offset1 + offset2)) / charea
+                else:
-            return c, error
+                    lt_col_overlap.append(-1)
            if len(filter(lambda x: x != -1, lt_col_overlap)) == 0:
                logging.warning("Text doesn't fit any column.")
            r_idx = r
            c_idx = lt_col_overlap.index(max(lt_col_overlap))
            break
    # error calculation
    y0_offset, y1_offset, x0_offset, x1_offset = [0] * 4
    if t.y0 > table.rows[r_idx][0]:
        y0_offset = abs(t.y0 - table.rows[r_idx][0])
    if t.y1 < table.rows[r_idx][1]:
        y1_offset = abs(t.y1 - table.rows[r_idx][1])
    if t.x0 < table.cols[c_idx][0]:
        x0_offset = abs(t.x0 - table.cols[c_idx][0])
    if t.x1 > table.cols[c_idx][1]:
        x1_offset = abs(t.x1 - table.cols[c_idx][1])
    X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
    Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
    charea = X * Y
    error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
    if split_text:
        return split_textline(table, t, direction), error
    else:
        return [(r_idx, c_idx, t.get_text().strip('\n'))], error
 def get_score(error_weights):
@ -448,9 +693,14 @@ def count_empty(d):
    Returns
    -------
-    n_empty_rows : number of empty rows
+    n_empty_rows : list
-    n_empty_cols : number of empty columns
+        Number of empty rows.
-    empty_p : percentage of empty cells
+
    n_empty_cols : list
        Number of empty columns.
    empty_p : float
        Percentage of empty cells.
    """
    empty_p = 0
    r_nempty_cells, c_nempty_cells = [], []
@ -491,7 +741,7 @@ def encode_list(ar):
    return ar
-def get_text_objects(layout, LTType="char", t=None):
+def get_text_objects(layout, ltype="char", t=None):
    """Recursively parses pdf layout to get a list of
    text objects.
@ -500,7 +750,7 @@ def get_text_objects(layout, LTType="char", t=None):
    layout : object
        PDFMiner LTPage object.
-    LTType : string
+    ltype : string
        {'char', 'lh', 'lv'}
        Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal,
        and LTTextLineVertical objects respectively.
@ -512,11 +762,11 @@ def get_text_objects(layout, LTType="char", t=None):
    t : list
        List of PDFMiner text objects.
    """
-    if LTType == "char":
+    if ltype == "char":
        LTObject = LTChar
-    elif LTType == "lh":
+    elif ltype == "lh":
        LTObject = LTTextLineHorizontal
-    elif LTType == "lv":
+    elif ltype == "lv":
        LTObject = LTTextLineVertical
    if t is None:
        t = []
@ -525,7 +775,7 @@ def get_text_objects(layout, LTType="char", t=None):
            if isinstance(obj, LTObject):
                t.append(obj)
            else:
-                t += get_text_objects(obj, LTType=LTType)
+                t += get_text_objects(obj, ltype=ltype)
    except AttributeError:
        pass
    return t
--- a/tests/test_lattice.py
+++ b/tests/test_lattice.py
@ -32,7 +32,7 @@ def test_lattice_basic():
 def test_lattice_fill():
    data = [
-        ["Plan Type","County","Plan  Name","Totals"],
+        ["Plan Type","County","Plan Name","Totals"],
        ["GMC","Sacramento","Anthem Blue Cross","164,380"],
        ["GMC","Sacramento","Health Net","126,547"],
        ["GMC","Sacramento","Kaiser Foundation","74,620"],
@ -122,46 +122,4 @@ def test_lattice_table_rotation():
    pdfname = os.path.join(testdir, 'right_rotated_table_1.pdf')
    manager = Pdf(Lattice(), pdfname, clean=True)
    tables = manager.extract()
    assert_equal(tables['page-1']['table-1']['data'], data)
 def test_lattice_cell_rotation():
    data = [
        ["Sl.No.","District","Projected Population for 2012-13(In lakhs)","Adult Equivalent  to 88%(In lakhs)","Total Consumptionrequirement(@ 400gms/adult/day)(In Lakh tonnes)","Total Requirement(Including seeds, feeds & wastage)(In Lakh tonnes)","Production (Rice)(In Lakh tonnes)","","","Surplus/Deﬁ cit(In Lakh tonnes)",""],
        ["","","","","","","Kharif","Rabi","Total","Rice","Paddy"],
        ["1","Balasore","23.65","20.81","3.04","3.47","2.78","0.86","3.64","0.17","0.25"],
        ["2","Bhadrak","15.34","13.50","1.97","2.25","3.50","0.05","3.55","1.30","1.94"],
        ["3","Balangir","17.01","14.97","2.19","2.50","6.23","0.10","6.33","3.83","5.72"],
        ["4","Subarnapur","6.70","5.90","0.86","0.98","4.48","1.13","5.61","4.63","6.91"],
        ["5","Cuttack","26.63","23.43","3.42","3.91","3.75","0.06","3.81","-0.10","-0.15"],
        ["6","Jagatsingpur","11.49","10.11","1.48","1.69","2.10","0.02","2.12","0.43","0.64"],
        ["7","Jajpur","18.59","16.36","2.39","2.73","2.13","0.04","2.17","-0.56","-0.84"],
        ["8","Kendrapara","14.62","12.87","1.88","2.15","2.60","0.07","2.67","0.52","0.78"],
        ["9","Dhenkanal","12.13","10.67","1.56","1.78","2.26","0.02","2.28","0.50","0.75"],
        ["10","Angul","12.93","11.38","1.66","1.90","1.73","0.02","1.75","-0.15","-0.22"],
        ["11","Ganjam","35.77","31.48","4.60","5.26","4.57","0.00","4.57","-0.69","-1.03"],
        ["12","Gajapati","5.85","5.15","0.75","0.86","0.68","0.01","0.69","-0.17","-0.25"],
        ["13","Kalahandi","16.12","14.19","2.07","2.37","5.42","1.13","6.55","4.18","6.24"],
        ["14","Nuapada","6.18","5.44","0.79","0.90","1.98","0.08","2.06","1.16","1.73"],
        ["15","Keonjhar","18.42","16.21","2.37","2.71","2.76","0.08","2.84","0.13","0.19"],
        ["16","Koraput","14.09","12.40","1.81","2.07","2.08","0.34","2.42","0.35","0.52"],
        ["17","Malkangiri","6.31","5.55","0.81","0.93","1.78","0.04","1.82","0.89","1.33"],
        ["18","Nabarangpur","12.50","11.00","1.61","1.84","3.26","0.02","3.28","1.44","2.15"],
        ["19","Rayagada","9.83","8.65","1.26","1.44","1.15","0.03","1.18","-0.26","-0.39"],
        ["20","Mayurbhanj","25.61","22.54","3.29","3.76","4.90","0.06","4.96","1.20","1.79"],
        ["21","Kandhamal","7.45","6.56","0.96","1.10","0.70","0.01","0.71","-0.39","-0.58"],
        ["22","Boudh","4.51","3.97","0.58","0.66","1.73","0.03","1.76","1.10","1.64"],
        ["23","Puri","17.29","15.22","2.22","2.54","2.45","0.99","3.44","0.90","1.34"],
        ["24","Khordha","23.08","20.31","2.97","3.39","2.02","0.03","2.05","-1.34","-2.00"],
        ["25","Nayagarh","9.78","8.61","1.26","1.44","2.10","0.00","2.10","0.66","0.99"],
        ["26","Sambalpur","10.62","9.35","1.37","1.57","3.45","0.71","4.16","2.59","3.87"],
        ["27","Bargarh","15.00","13.20","1.93","2.21","6.87","2.65","9.52","7.31","10.91"],
        ["28","Deogarh","3.18","2.80","0.41","0.47","1.12","0.07","1.19","0.72","1.07"],
        ["29","Jharsuguda","5.91","5.20","0.76","0.87","0.99","0.01","1.00","0.13","0.19"],
        ["30","Sundargarh","21.21","18.66","2.72","3.11","4.72","0.02","4.74","1.63","2.43"],
        ["ODISHA","","427.80","376.49","54.99","62.86","86.29","8.68","94.97","32.11","47.92"]
    ]
    pdfname = os.path.join(testdir, 'agstat.pdf')
    manager = Pdf(Lattice(), pdfname, clean=True)
    tables = manager.extract()
    assert_equal(tables['page-1']['table-1']['data'], data)
--- a/tests/test_stream.py
+++ b/tests/test_stream.py
@ -169,45 +169,45 @@ def test_stream_columns():
 def test_stream_table_rotation():
    data = [
-        ["Table 21  Current use of contraception by background characteristics—Continued","","","","","","","","","","","","","","","",""],
+        ["","","Table 21  Current use of contraception by background characteristics—Continued","","","","","","","","","","","","","","",""],
-        ["","","","","","Modern method","","","","","","","Traditional method","","","",""],
+        ["","","","","","","Modern method","","","","","","","Traditional method","","","",""],
-        ["","","Any","","","","","","","Other","Any","","","","Not","","Number"],
+        ["","","","Any","","","","","","","Other","Any","","","","Not","","Number"],
-        ["","Any","modern","Female","Male","","","","Condom/","modern","traditional","","With-","Folk","currently","","of"],
+        ["","","Any","modern","Female","Male","","","","Condom/","modern","traditional","","With-","Folk","currently","","of"],
-        ["Background characteristic","method","method","sterilization","sterilization","Pill","IUD","Injectables","Nirodh","method","method","Rhythm","drawal","method","using","Total","women"],
+        ["","Background characteristic","method","method","sterilization","sterilization","Pill","IUD","Injectables","Nirodh","method","method","Rhythm","drawal","method","using","Total","women"],
-        ["Caste/tribe","","","","","","","","","","","","","","","",""],
+        ["","Caste/tribe","","","","","","","","","","","","","","","",""],
-        ["Scheduled caste","74.8","55.8","42.9","0.9","9.7","0.0","0.2","2.2","0.0","19.0","11.2","7.4","0.4","25.2","100.0","1,363"],
+        ["","Scheduled caste","74.8","55.8","42.9","0.9","9.7","0.0","0.2","2.2","0.0","19.0","11.2","7.4","0.4","25.2","100.0","1,363"],
-        ["Scheduled tribe","59.3","39.0","26.8","0.6","6.4","0.6","1.2","3.5","0.0","20.3","10.4","5.8","4.1","40.7","100.0","256"],
+        ["","Scheduled tribe","59.3","39.0","26.8","0.6","6.4","0.6","1.2","3.5","0.0","20.3","10.4","5.8","4.1","40.7","100.0","256"],
-        ["Other backward class","71.4","51.1","34.9","0.0","8.6","1.4","0.0","6.2","0.0","20.4","12.6","7.8","0.0","28.6","100.0","211"],
+        ["","Other backward class","71.4","51.1","34.9","0.0","8.6","1.4","0.0","6.2","0.0","20.4","12.6","7.8","0.0","28.6","100.0","211"],
-        ["Other","71.1","48.8","28.2","0.8","13.3","0.9","0.3","5.2","0.1","22.3","12.9","9.1","0.3","28.9","100.0","3,319"],
+        ["","Other","71.1","48.8","28.2","0.8","13.3","0.9","0.3","5.2","0.1","22.3","12.9","9.1","0.3","28.9","100.0","3,319"],
-        ["Wealth index","","","","","","","","","","","","","","","",""],
+        ["","Wealth index","","","","","","","","","","","","","","","",""],
-        ["Lowest","64.5","48.6","34.3","0.5","10.5","0.6","0.7","2.0","0.0","15.9","9.9","4.6","1.4","35.5","100.0","1,258"],
+        ["","Lowest","64.5","48.6","34.3","0.5","10.5","0.6","0.7","2.0","0.0","15.9","9.9","4.6","1.4","35.5","100.0","1,258"],
-        ["Second","68.5","50.4","36.2","1.1","11.4","0.5","0.1","1.1","0.0","18.1","11.2","6.7","0.2","31.5","100.0","1,317"],
+        ["","Second","68.5","50.4","36.2","1.1","11.4","0.5","0.1","1.1","0.0","18.1","11.2","6.7","0.2","31.5","100.0","1,317"],
-        ["Middle","75.5","52.8","33.6","0.6","14.2","0.4","0.5","3.4","0.1","22.7","13.4","8.9","0.4","24.5","100.0","1,018"],
+        ["","Middle","75.5","52.8","33.6","0.6","14.2","0.4","0.5","3.4","0.1","22.7","13.4","8.9","0.4","24.5","100.0","1,018"],
-        ["Fourth","73.9","52.3","32.0","0.5","12.5","0.6","0.2","6.3","0.2","21.6","11.5","9.9","0.2","26.1","100.0","908"],
+        ["","Fourth","73.9","52.3","32.0","0.5","12.5","0.6","0.2","6.3","0.2","21.6","11.5","9.9","0.2","26.1","100.0","908"],
-        ["Highest","78.3","44.4","19.5","1.0","9.7","1.4","0.0","12.7","0.0","33.8","18.2","15.6","0.0","21.7","100.0","733"],
+        ["","Highest","78.3","44.4","19.5","1.0","9.7","1.4","0.0","12.7","0.0","33.8","18.2","15.6","0.0","21.7","100.0","733"],
-        ["Number of living children","","","","","","","","","","","","","","","",""],
+        ["","Number of living children","","","","","","","","","","","","","","","",""],
-        ["No children","25.1","7.6","0.3","0.5","2.0","0.0","0.0","4.8","0.0","17.5","9.0","8.5","0.0","74.9","100.0","563"],
+        ["","No children","25.1","7.6","0.3","0.5","2.0","0.0","0.0","4.8","0.0","17.5","9.0","8.5","0.0","74.9","100.0","563"],
-        ["1 child","66.5","32.1","3.7","0.7","20.1","0.7","0.1","6.9","0.0","34.3","18.9","15.2","0.3","33.5","100.0","1,190"],
+        ["","1 child","66.5","32.1","3.7","0.7","20.1","0.7","0.1","6.9","0.0","34.3","18.9","15.2","0.3","33.5","100.0","1,190"],
-        ["1 son","66.8","33.2","4.1","0.7","21.1","0.5","0.3","6.6","0.0","33.5","21.2","12.3","0.0","33.2","100.0","672"],
+        ["","1 son","66.8","33.2","4.1","0.7","21.1","0.5","0.3","6.6","0.0","33.5","21.2","12.3","0.0","33.2","100.0","672"],
-        ["No sons","66.1","30.7","3.1","0.6","18.8","0.8","0.0","7.3","0.0","35.4","15.8","19.0","0.6","33.9","100.0","517"],
+        ["","No sons","66.1","30.7","3.1","0.6","18.8","0.8","0.0","7.3","0.0","35.4","15.8","19.0","0.6","33.9","100.0","517"],
-        ["2 children","81.6","60.5","41.8","0.9","11.6","0.8","0.3","4.8","0.2","21.1","12.2","8.3","0.6","18.4","100.0","1,576"],
+        ["","2 children","81.6","60.5","41.8","0.9","11.6","0.8","0.3","4.8","0.2","21.1","12.2","8.3","0.6","18.4","100.0","1,576"],
-        ["1 or more sons","83.7","64.2","46.4","0.9","10.8","0.8","0.4","4.8","0.1","19.5","11.1","7.6","0.7","16.3","100.0","1,268"],
+        ["","1 or more sons","83.7","64.2","46.4","0.9","10.8","0.8","0.4","4.8","0.1","19.5","11.1","7.6","0.7","16.3","100.0","1,268"],
-        ["No sons","73.2","45.5","23.2","1.0","15.1","0.9","0.0","4.8","0.5","27.7","16.8","11.0","0.0","26.8","100.0","308"],
+        ["","No sons","73.2","45.5","23.2","1.0","15.1","0.9","0.0","4.8","0.5","27.7","16.8","11.0","0.0","26.8","100.0","308"],
-        ["3 children","83.9","71.2","57.7","0.8","9.8","0.6","0.5","1.8","0.0","12.7","8.7","3.3","0.8","16.1","100.0","961"],
+        ["","3 children","83.9","71.2","57.7","0.8","9.8","0.6","0.5","1.8","0.0","12.7","8.7","3.3","0.8","16.1","100.0","961"],
-        ["1 or more sons","85.0","73.2","60.3","0.9","9.4","0.5","0.5","1.6","0.0","11.8","8.1","3.0","0.7","15.0","100.0","860"],
+        ["","1 or more sons","85.0","73.2","60.3","0.9","9.4","0.5","0.5","1.6","0.0","11.8","8.1","3.0","0.7","15.0","100.0","860"],
-        ["No sons","74.7","53.8","35.3","0.0","13.7","1.6","0.0","3.2","0.0","20.9","13.4","6.1","1.5","25.3","100.0","101"],
+        ["","No sons","74.7","53.8","35.3","0.0","13.7","1.6","0.0","3.2","0.0","20.9","13.4","6.1","1.5","25.3","100.0","101"],
-        ["4+ children","74.3","58.1","45.1","0.6","8.7","0.6","0.7","2.4","0.0","16.1","9.9","5.4","0.8","25.7","100.0","944"],
+        ["","4+ children","74.3","58.1","45.1","0.6","8.7","0.6","0.7","2.4","0.0","16.1","9.9","5.4","0.8","25.7","100.0","944"],
-        ["1 or more sons","73.9","58.2","46.0","0.7","8.3","0.7","0.7","1.9","0.0","15.7","9.4","5.5","0.8","26.1","100.0","901"],
+        ["","1 or more sons","73.9","58.2","46.0","0.7","8.3","0.7","0.7","1.9","0.0","15.7","9.4","5.5","0.8","26.1","100.0","901"],
-        ["No sons","(82.1)","(57.3)","(25.6)","(0.0)","(17.8)","(0.0)","(0.0)","(13.9)","(0.0)","(24.8)","(21.3)","(3.5)","(0.0)","(17.9)","100.0","43"],
+        ["","No sons","(82.1)","(57.3)","(25.6)","(0.0)","(17.8)","(0.0)","(0.0)","(13.9)","(0.0)","(24.8)","(21.3)","(3.5)","(0.0)","(17.9)","100.0","43"],
-        ["Total","71.2","49.9","32.2","0.7","11.7","0.6","0.3","4.3","0.1","21.3","12.3","8.4","0.5","28.8","100.0","5,234"],
+        ["","Total","71.2","49.9","32.2","0.7","11.7","0.6","0.3","4.3","0.1","21.3","12.3","8.4","0.5","28.8","100.0","5,234"],
-        ["NFHS-2 (1998-99)","66.6","47.3","32.0","1.8","9.2","1.4","na","2.9","na","na","8.7","9.8","na","33.4","100.0","4,116"],
+        ["","NFHS-2 (1998-99)","66.6","47.3","32.0","1.8","9.2","1.4","na","2.9","na","na","8.7","9.8","na","33.4","100.0","4,116"],
-        ["NFHS-1 (1992-93)","57.7","37.6","26.5","4.3","3.6","1.3","0.1","1.9","na","na","11.3","8.3","na","42.3","100.0","3,970"],
+        ["","NFHS-1 (1992-93)","57.7","37.6","26.5","4.3","3.6","1.3","0.1","1.9","na","na","11.3","8.3","na","42.3","100.0","3,970"],
-        ["","Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are","","","","","","","","","","","","","","",""],
+        ["","","Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are","","","","","","","","","","","","","","",""],
-        ["not shown separately.","","","","","","","","","","","","","","","",""],
+        ["","not shown separately.","","","","","","","","","","","","","","","",""],
-        ["na = Not available","","","","","","","","","","","","","","","",""],
+        ["","na = Not available","","","","","","","","","","","","","","","",""],
-        ["","ns = Not shown; see table 2b, footnote 1","","","","","","","","","","","","","","",""],
+        ["","","ns = Not shown; see table 2b, footnote 1","","","","","","","","","","","","","","",""],
-        ["( ) Based on 25-49 unweighted cases.","","","","","","","","","","","","","","","",""],
+        ["","( ) Based on 25-49 unweighted cases.","","","","","","","","","","","","","","","",""],
-        ["","","","","","","","54","","","","","","","","",""]
+        ["","","","","","","","","54","","","","","","","","",""]
    ]
    pdfname = os.path.join(testdir, "left_rotated_table_2.pdf")
    manager = Pdf(Stream(), pdfname, clean=True)