From a43d5ca2c76269f4e77d1517f9114b8a43286f69 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Wed, 12 Oct 2016 13:17:02 +0530 Subject: [PATCH] Replace chars with textlines * Add split function * Add split_text and shift_text params * Change get_rotation * Move get_column_index to utils * Add split_text and shift_text * Fix split_text --- camelot/lattice.py | 215 ++++++++++-------------- camelot/pdf.py | 44 ++--- camelot/stream.py | 165 ++++++++---------- camelot/table.py | 90 ++++++---- camelot/utils.py | 382 ++++++++++++++++++++++++++++++++++-------- tests/test_lattice.py | 44 +---- tests/test_stream.py | 78 ++++----- 7 files changed, 590 insertions(+), 428 deletions(-) diff --git a/camelot/lattice.py b/camelot/lattice.py index 79a6cf7..dc34b28 100644 --- a/camelot/lattice.py +++ b/camelot/lattice.py @@ -8,10 +8,10 @@ import subprocess from .imgproc import (adaptive_threshold, find_lines, find_table_contours, find_table_joints) from .table import Table -from .utils import (scale_to_pdf, scale_to_image, get_rotation, segments_bbox, - text_bbox, merge_close_values, get_row_index, - get_column_index, get_score, count_empty, encode_list, - get_text_objects, get_page_layout) +from .utils import (scale_to_pdf, scale_to_image, get_rotation, rotate_segments, + rotate_textlines, rotate_table, segments_bbox, text_in_bbox, + merge_close_values, get_table_index, get_score, count_empty, + encode_list, get_text_objects, get_page_layout) __all__ = ['Lattice'] @@ -25,6 +25,52 @@ def _reduce_method(m): copy_reg.pickle(types.MethodType, _reduce_method) +def _reduce_index(t, idx, shift_text): + """Reduces index of a text object if it lies within a spanning + cell taking in account table rotation. + + Parameters + ---------- + table : object + camelot.table.Table + + idx : list + List of tuples of the form (r_idx, c_idx, text). + + shift_text : list + {'l', 'r', 't', 'b'} + Select one or more from above and pass them as a list to + specify where the text in a spanning cell should flow. + + Returns + ------- + indices : list + List of tuples of the form (idx, text) where idx is the reduced + index of row/column and text is the an lttextline substring. + """ + indices = [] + for r_idx, c_idx, text in idx: + for d in shift_text: + if d == 'l': + if t.cells[r_idx][c_idx].spanning_h: + while not t.cells[r_idx][c_idx].left: + c_idx -= 1 + if d == 'r': + if t.cells[r_idx][c_idx].spanning_h: + while not t.cells[r_idx][c_idx].right: + c_idx += 1 + if d == 't': + if t.cells[r_idx][c_idx].spanning_v: + while not t.cells[r_idx][c_idx].top: + r_idx -= 1 + if d == 'b': + if t.cells[r_idx][c_idx].spanning_v: + while not t.cells[r_idx][c_idx].bottom: + r_idx += 1 + indices.append((r_idx, c_idx, text)) + return indices + + def _fill_spanning(t, fill=None): """Fills spanning cells. @@ -67,78 +113,6 @@ def _fill_spanning(t, fill=None): return t -def _outline(t): - """Sets table border edges to True. - - Parameters - ---------- - t : object - camelot.table.Table - - Returns - ------- - t : object - camelot.table.Table - """ - for i in range(len(t.cells)): - t.cells[i][0].left = True - t.cells[i][len(t.cells[i]) - 1].right = True - for i in range(len(t.cells[0])): - t.cells[0][i].top = True - t.cells[len(t.cells) - 1][i].bottom = True - return t - - -def _reduce_index(t, rotation, r_idx, c_idx): - """Reduces index of a text object if it lies within a spanning - cell taking in account table rotation. - - Parameters - ---------- - t : object - camelot.table.Table - - rotation : string - {'', 'left', 'right'} - - r_idx : int - Current row index. - - c_idx : int - Current column index. - - Returns - ------- - r_idx : int - Reduced row index. - - c_idx : int - Reduced column index. - """ - if not rotation: - if t.cells[r_idx][c_idx].spanning_h: - while not t.cells[r_idx][c_idx].left: - c_idx -= 1 - if t.cells[r_idx][c_idx].spanning_v: - while not t.cells[r_idx][c_idx].top: - r_idx -= 1 - elif rotation == 'left': - if t.cells[r_idx][c_idx].spanning_h: - while not t.cells[r_idx][c_idx].left: - c_idx -= 1 - if t.cells[r_idx][c_idx].spanning_v: - while not t.cells[r_idx][c_idx].bottom: - r_idx += 1 - elif rotation == 'right': - if t.cells[r_idx][c_idx].spanning_h: - while not t.cells[r_idx][c_idx].right: - c_idx += 1 - if t.cells[r_idx][c_idx].spanning_v: - while not t.cells[r_idx][c_idx].top: - r_idx -= 1 - return r_idx, c_idx - - class Lattice: """Lattice looks for lines in the pdf to form a table. @@ -179,6 +153,17 @@ class Lattice: PDFMiner margins. (char_margin, line_margin, word_margin) (optional, default: (1.0, 0.5, 0.1)) + split_text : bool + Whether or not to split a text line if it spans across + different cells. + (optional, default: False) + + shift_text : list + {'l', 'r', 't', 'b'} + Select one or more from above and pass them as a list to + specify where the text in a spanning cell should flow. + (optional, default: ['l', 't']) + debug : string {'contour', 'line', 'joint', 'table'} Set to one of the above values to generate a matplotlib plot @@ -186,7 +171,8 @@ class Lattice: (optional, default: None) """ def __init__(self, table_area=None, fill=None, mtol=[2], scale=15, - invert=False, margins=(1.0, 0.5, 0.1), debug=None): + invert=False, margins=(1.0, 0.5, 0.1), split_text=False, + shift_text=['l', 't'], debug=None): self.method = 'lattice' self.table_area = table_area @@ -195,6 +181,8 @@ class Lattice: self.scale = scale self.invert = invert self.char_margin, self.line_margin, self.word_margin = margins + self.split_text = split_text + self.shift_text = shift_text self.debug = debug def get_tables(self, pdfname): @@ -211,9 +199,9 @@ class Lattice: """ layout, dim = get_page_layout(pdfname, char_margin=self.char_margin, line_margin=self.line_margin, word_margin=self.word_margin) - ltchar = get_text_objects(layout, LTType="char") - lttextlh = get_text_objects(layout, LTType="lh") - lttextlv = get_text_objects(layout, LTType="lv") + lttextlh = get_text_objects(layout, ltype="lh") + lttextlv = get_text_objects(layout, ltype="lv") + ltchar = get_text_objects(layout, ltype="char") width, height = dim bname, __ = os.path.splitext(pdfname) if not ltchar: @@ -287,11 +275,15 @@ class Lattice: # select elements which lie within table_bbox table_data = {} v_s, h_s = segments_bbox(k, v_segments, h_segments) - char_bbox = text_bbox(k, ltchar) - lh_bbox = text_bbox(k, lttextlh) - lv_bbox = text_bbox(k, lttextlv) + lh_bbox = text_in_bbox(k, lttextlh) + lv_bbox = text_in_bbox(k, lttextlv) + char_bbox = text_in_bbox(k, ltchar) table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar))) - table_rotation = get_rotation(char_bbox, lh_bbox, lv_bbox) + table_rotation = get_rotation(lh_bbox, lv_bbox, char_bbox) + v_s, h_s = rotate_segments(v_s, h_s, table_rotation) + t_bbox = rotate_textlines(lh_bbox, lv_bbox, table_rotation) + for direction in t_bbox: + t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0)) cols, rows = zip(*table_bbox[k]) cols, rows = list(cols), list(rows) cols.extend([k[0], k[2]]) @@ -305,6 +297,7 @@ class Lattice: for i in range(0, len(cols) - 1)] rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] + rows, cols = rotate_table(rows, cols, table_rotation) table = Table(cols, rows) # set table edges to True using ver+hor lines table = table.set_edges(v_s, h_s) @@ -313,58 +306,26 @@ class Lattice: # set spanning cells to True table = table.set_spanning() # set table border edges to True - table = _outline(table) + table = table.set_border_edges() if self.debug: self.debug_tables.append(table) - rerror = [] - cerror = [] - for t in char_bbox: - try: - r_idx, rass_error = get_row_index(t, rows) - except TypeError: - # couldn't assign LTChar to any cell - continue - try: - c_idx, cass_error = get_column_index(t, cols) - except TypeError: - # couldn't assign LTChar to any cell - continue - rerror.append(rass_error) - cerror.append(cass_error) - r_idx, c_idx = _reduce_index(table, table_rotation, r_idx, c_idx) - table.cells[r_idx][c_idx].add_object(t) - - for i in range(len(table.cells)): - for j in range(len(table.cells[i])): - t_bbox = table.cells[i][j].get_objects() - try: - cell_rotation = get_rotation(t_bbox) - except ZeroDivisionError: - cell_rotation = '' - pass - # fill text after sorting it - if cell_rotation == '': - t_bbox.sort(key=lambda x: (-x.y0, x.x0)) - elif cell_rotation == 'left': - t_bbox.sort(key=lambda x: (x.x0, x.y0)) - elif cell_rotation == 'right': - t_bbox.sort(key=lambda x: (-x.x0, -x.y0)) - table.cells[i][j].add_text(''.join([t.get_text() - for t in t_bbox])) - - score = get_score([[50, rerror], [50, cerror]]) + assignment_errors = [] + for direction in t_bbox: + for t in t_bbox[direction]: + indices, error = get_table_index( + table, t, direction, split_text=self.split_text) + assignment_errors.append(error) + indices = _reduce_index(table, indices, shift_text=self.shift_text) + for r_idx, c_idx, text in indices: + table.cells[r_idx][c_idx].add_text(text) + score = get_score([[100, assignment_errors]]) table_data['score'] = score if self.fill is not None: table = _fill_spanning(table, fill=self.fill[table_no]) ar = table.get_list() - if table_rotation == 'left': - ar = zip(*ar[::-1]) - elif table_rotation == 'right': - ar = zip(*ar[::1]) - ar.reverse() ar = encode_list(ar) table_data['data'] = ar empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar) diff --git a/camelot/pdf.py b/camelot/pdf.py index 4bd9475..e1bfdb2 100644 --- a/camelot/pdf.py +++ b/camelot/pdf.py @@ -196,28 +196,28 @@ class Pdf: try: for tables in self.debug_tables: for table in tables: - for i in range(len(table.cells)): - for j in range(len(table.cells[i])): - if table.cells[i][j].left: - plt.plot([table.cells[i][j].lb[0], - table.cells[i][j].lt[0]], - [table.cells[i][j].lb[1], - table.cells[i][j].lt[1]]) - if table.cells[i][j].right: - plt.plot([table.cells[i][j].rb[0], - table.cells[i][j].rt[0]], - [table.cells[i][j].rb[1], - table.cells[i][j].rt[1]]) - if table.cells[i][j].top: - plt.plot([table.cells[i][j].lt[0], - table.cells[i][j].rt[0]], - [table.cells[i][j].lt[1], - table.cells[i][j].rt[1]]) - if table.cells[i][j].bottom: - plt.plot([table.cells[i][j].lb[0], - table.cells[i][j].rb[0]], - [table.cells[i][j].lb[1], - table.cells[i][j].rb[1]]) + for r in range(len(table.rows)): + for c in range(len(table.cols)): + if table.cells[r][c].left: + plt.plot([table.cells[r][c].lb[0], + table.cells[r][c].lt[0]], + [table.cells[r][c].lb[1], + table.cells[r][c].lt[1]]) + if table.cells[r][c].right: + plt.plot([table.cells[r][c].rb[0], + table.cells[r][c].rt[0]], + [table.cells[r][c].rb[1], + table.cells[r][c].rt[1]]) + if table.cells[r][c].top: + plt.plot([table.cells[r][c].lt[0], + table.cells[r][c].rt[0]], + [table.cells[r][c].lt[1], + table.cells[r][c].rt[1]]) + if table.cells[r][c].bottom: + plt.plot([table.cells[r][c].lb[0], + table.cells[r][c].rb[0]], + [table.cells[r][c].lb[1], + table.cells[r][c].rb[1]]) plt.show() except AttributeError: raise ValueError("This option only be used with Lattice.") diff --git a/camelot/stream.py b/camelot/stream.py index bf5260e..dfc74b6 100644 --- a/camelot/stream.py +++ b/camelot/stream.py @@ -7,8 +7,9 @@ import copy_reg import numpy as np from .table import Table -from .utils import (rotate, get_rotation, text_bbox, get_row_index, get_score, - count_empty, encode_list, get_text_objects, get_page_layout) +from .utils import (rotate, get_rotation, rotate_textlines, text_in_bbox, + get_table_index, get_score, count_empty, encode_list, + get_text_objects, get_page_layout) __all__ = ['Stream'] @@ -22,6 +23,29 @@ def _reduce_method(m): copy_reg.pickle(types.MethodType, _reduce_method) +def _text_bbox(t_bbox): + """Returns bounding box for the text present on a page. + + Parameters + ---------- + t_bbox : dict + Dict with two keys 'horizontal' and 'vertical' with lists of + LTTextLineHorizontals and LTTextLineVerticals respectively. + + Returns + ------- + text_bbox : tuple + Tuple of the form (x0, y0, x1, y1) in PDFMiner's coordinate + space. + """ + xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]]) + ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]]) + xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]]) + ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]]) + text_bbox = (xmin, ymin, xmax, ymax) + return text_bbox + + def _group_rows(text, ytol=2): """Groups PDFMiner text objects into rows using their y-coordinates taking into account some tolerance ytol. @@ -185,45 +209,6 @@ def _add_columns(cols, text, ytol): return cols -def _get_column_index(t, columns): - """Gets index of the column in which the given text object lies by - comparing their x-coordinates. - - Parameters - ---------- - t : object - - columns : list - List of column coordinate tuples. - - Returns - ------- - c_idx : int - - error : float - """ - offset1, offset2 = 0, 0 - lt_col_overlap = [] - for c in columns: - if c[0] <= t.x1 and c[1] >= t.x0: - left = t.x0 if c[0] <= t.x0 else c[0] - right = t.x1 if c[1] >= t.x1 else c[1] - lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1])) - else: - lt_col_overlap.append(-1) - if len(filter(lambda x: x != -1, lt_col_overlap)) == 0: - logging.warning("Text doesn't fit any column.") - c_idx = lt_col_overlap.index(max(lt_col_overlap)) - if t.x0 < columns[c_idx][0]: - offset1 = abs(t.x0 - columns[c_idx][0]) - if t.x1 > columns[c_idx][1]: - offset2 = abs(t.x1 - columns[c_idx][1]) - Y = abs(t.y0 - t.y1) - charea = abs(t.x0 - t.x1) * abs(t.y0 - t.y1) - error = (Y * (offset1 + offset2)) / charea - return c_idx, error - - class Stream: """Stream looks for spaces between text elements to form a table. @@ -265,13 +250,19 @@ class Stream: PDFMiner margins. (char_margin, line_margin, word_margin) (optional, default: (1.0, 0.5, 0.1)) + split_text : bool + Whether or not to split a text line if it spans across + different cells. + (optional, default: False) + debug : bool Set to True to generate a matplotlib plot of LTTextLineHorizontals in order to select table_area, columns. (optional, default: False) """ def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2], - mtol=[0], margins=(1.0, 0.5, 0.1), debug=False): + mtol=[0], margins=(1.0, 0.5, 0.1), split_text=False, + debug=False): self.method = 'stream' self.table_area = table_area @@ -280,6 +271,7 @@ class Stream: self.ytol = ytol self.mtol = mtol self.char_margin, self.line_margin, self.word_margin = margins + self.split_text = split_text self.debug = debug def get_tables(self, pdfname): @@ -296,9 +288,9 @@ class Stream: """ layout, dim = get_page_layout(pdfname, char_margin=self.char_margin, line_margin=self.line_margin, word_margin=self.word_margin) - ltchar = get_text_objects(layout, LTType="char") - lttextlh = get_text_objects(layout, LTType="lh") - lttextlv = get_text_objects(layout, LTType="lv") + lttextlh = get_text_objects(layout, ltype="lh") + lttextlv = get_text_objects(layout, ltype="lv") + ltchar = get_text_objects(layout, ltype="char") width, height = dim bname, __ = os.path.splitext(pdfname) if not lttextlh: @@ -308,6 +300,8 @@ class Stream: if self.debug: self.debug_text = [] + self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh]) + self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv]) if self.table_area is not None: if self.columns is not None: @@ -339,34 +333,16 @@ class Stream: for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True): # select elements which lie within table_bbox table_data = {} - table_rotation = get_rotation(ltchar, lttextlh, lttextlv) - if table_rotation != '': - t_bbox = text_bbox(k, lttextlv) - if table_rotation == 'left': - if self.debug: - self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv]) - for t in t_bbox: - x0, y0, x1, y1 = t.bbox - x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2) - x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2) - t.set_bbox((x0, y1, x1, y0)) - elif table_rotation == 'right': - for t in t_bbox: - x0, y0, x1, y1 = t.bbox - x0, y0 = rotate(0, 0, x0, y0, np.pi / 2) - x1, y1 = rotate(0, 0, x1, y1, np.pi / 2) - t.set_bbox((x1, y0, x0, y1)) - else: - if self.debug: - self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh]) - t_bbox = text_bbox(k, lttextlh) - t_bbox.sort(key=lambda x: (-x.y0, x.x0)) - - text_x_min = min([t.x0 for t in t_bbox]) - text_y_min = min([t.y0 for t in t_bbox]) - text_x_max = max([t.x1 for t in t_bbox]) - text_y_max = max([t.y1 for t in t_bbox]) - rows_grouped = _group_rows(t_bbox, ytol=self.ytol[table_no]) + lh_bbox = text_in_bbox(k, lttextlh) + lv_bbox = text_in_bbox(k, lttextlv) + char_bbox = text_in_bbox(k, ltchar) + table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar))) + table_rotation = get_rotation(lh_bbox, lv_bbox, char_bbox) + t_bbox = rotate_textlines(lh_bbox, lv_bbox, table_rotation) + for direction in t_bbox: + t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0)) + text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox) + rows_grouped = _group_rows(t_bbox['horizontal'], ytol=self.ytol[table_no]) rows = _join_rows(rows_grouped, text_y_max, text_y_min) elements = [len(r) for r in rows_grouped] @@ -402,9 +378,9 @@ class Stream: len_non_mode = len(filter(lambda x: x != ncols, elements)) if ncols == 1 and not self.debug: # no tables detected - logging.warning("{}: Only one column was detected, the PDF" + logging.warning("{}: Only one column was detected, the pdf" " may have no tables. Specify ncols if" - " the PDF has tables.".format( + " the pdf has tables.".format( os.path.basename(bname))) cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] @@ -413,35 +389,30 @@ class Stream: for i in range(1, len(cols)): left = cols[i - 1][1] right = cols[i][0] - inner_text.extend([t for t in t_bbox if t.x0 > left and t.x1 < right]) - outer_text = [t for t in t_bbox if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] + inner_text.extend([t for direction in t_bbox + for t in t_bbox[direction] + if t.x0 > left and t.x1 < right]) + outer_text = [t for direction in t_bbox + for t in t_bbox[direction] + if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] inner_text.extend(outer_text) cols = _add_columns(cols, inner_text, self.ytol[table_no]) cols = _join_columns(cols, text_x_min, text_x_max) table = Table(cols, rows) - rerror = [] - cerror = [] - for row in rows_grouped: - for t in row: - try: - r_idx, rass_error = get_row_index(t, rows) - except ValueError as e: - # couldn't assign LTTextLH to any cell - continue - try: - c_idx, cass_error = _get_column_index(t, cols) - except ValueError as e: - # couldn't assign LTTextLH to any cell - continue - rerror.append(rass_error) - cerror.append(cass_error) - table.cells[r_idx][c_idx].add_text( - t.get_text().strip('\n')) + table = table.set_all_edges() + assignment_errors = [] + for direction in t_bbox: + for t in t_bbox[direction]: + indices, error = get_table_index( + table, t, direction, split_text=self.split_text) + assignment_errors.append(error) + for r_idx, c_idx, text in indices: + table.cells[r_idx][c_idx].add_text(text) if guess: - score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]]) + score = get_score([[66, assignment_errors], [34, [len_non_mode / len(elements)]]]) else: - score = get_score([[50, rerror], [50, cerror]]) + score = get_score([[100, assignment_errors]]) table_data['score'] = score ar = table.get_list() diff --git a/camelot/table.py b/camelot/table.py index 7b84a86..0978e7e 100644 --- a/camelot/table.py +++ b/camelot/table.py @@ -35,6 +35,28 @@ class Table: for c in cols] for r in rows] self.nocont_ = 0 + def set_all_edges(self): + """Sets all table edges to True. + """ + for r in range(len(self.rows)): + for c in range(len(self.cols)): + self.cells[r][c].left = True + self.cells[r][c].right = True + self.cells[r][c].top = True + self.cells[r][c].bottom = True + return self + + def set_border_edges(self): + """Sets table border edges to True. + """ + for r in range(len(self.rows)): + self.cells[r][0].left = True + self.cells[r][len(self.cols) - 1].right = True + for c in range(len(self.cols)): + self.cells[0][c].top = True + self.cells[len(self.rows) - 1][c].bottom = True + return self + def set_edges(self, vertical, horizontal, jtol=2): """Sets a cell's edges to True depending on whether they overlap with lines found by imgproc. @@ -160,47 +182,47 @@ class Table: depending on whether the cell spans/extends horizontally or vertically. """ - for i in range(len(self.cells)): - for j in range(len(self.cells[i])): - bound = self.cells[i][j].get_bounded_edges() + for r in range(len(self.rows)): + for c in range(len(self.cols)): + bound = self.cells[r][c].get_bounded_edges() if bound == 4: continue elif bound == 3: - if not self.cells[i][j].left: - if (self.cells[i][j].right and - self.cells[i][j].top and - self.cells[i][j].bottom): - self.cells[i][j].spanning_h = True + if not self.cells[r][c].left: + if (self.cells[r][c].right and + self.cells[r][c].top and + self.cells[r][c].bottom): + self.cells[r][c].spanning_h = True - elif not self.cells[i][j].right: - if (self.cells[i][j].left and - self.cells[i][j].top and - self.cells[i][j].bottom): - self.cells[i][j].spanning_h = True + elif not self.cells[r][c].right: + if (self.cells[r][c].left and + self.cells[r][c].top and + self.cells[r][c].bottom): + self.cells[r][c].spanning_h = True - elif not self.cells[i][j].top: - if (self.cells[i][j].left and - self.cells[i][j].right and - self.cells[i][j].bottom): - self.cells[i][j].spanning_v = True + elif not self.cells[r][c].top: + if (self.cells[r][c].left and + self.cells[r][c].right and + self.cells[r][c].bottom): + self.cells[r][c].spanning_v = True - elif not self.cells[i][j].bottom: - if (self.cells[i][j].left and - self.cells[i][j].right and - self.cells[i][j].top): - self.cells[i][j].spanning_v = True + elif not self.cells[r][c].bottom: + if (self.cells[r][c].left and + self.cells[r][c].right and + self.cells[r][c].top): + self.cells[r][c].spanning_v = True elif bound == 2: - if self.cells[i][j].left and self.cells[i][j].right: - if (not self.cells[i][j].top and - not self.cells[i][j].bottom): - self.cells[i][j].spanning_v = True + if self.cells[r][c].left and self.cells[r][c].right: + if (not self.cells[r][c].top and + not self.cells[r][c].bottom): + self.cells[r][c].spanning_v = True - elif self.cells[i][j].top and self.cells[i][j].bottom: - if (not self.cells[i][j].left and - not self.cells[i][j].right): - self.cells[i][j].spanning_h = True + elif self.cells[r][c].top and self.cells[r][c].bottom: + if (not self.cells[r][c].left and + not self.cells[r][c].right): + self.cells[r][c].spanning_h = True return self @@ -213,7 +235,7 @@ class Table: ar : list """ ar = [] - for i in range(len(self.cells)): - ar.append([self.cells[i][j].get_text().strip() - for j in range(len(self.cells[i]))]) + for r in range(len(self.rows)): + ar.append([self.cells[r][c].get_text().strip() + for c in range(len(self.cols))]) return ar diff --git a/camelot/utils.py b/camelot/utils.py index 42d01f6..e798dd8 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -1,5 +1,6 @@ from __future__ import division import os +import logging import numpy as np @@ -11,7 +12,8 @@ from pdfminer.pdfinterp import PDFResourceManager from pdfminer.pdfinterp import PDFPageInterpreter from pdfminer.pdfdevice import PDFDevice from pdfminer.converter import PDFPageAggregator -from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal, LTTextLineVertical +from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal, + LTTextLineVertical) def translate(x1, x2): @@ -174,22 +176,20 @@ def scale_to_pdf(tables, v_segments, h_segments, factors): return tables_new, v_segments_new, h_segments_new -def get_rotation(ltchar, lttextlh=None, lttextlv=None): +def get_rotation(lttextlh, lttextlv, ltchar): """Detects if text in table is vertical or not using the current transformation matrix (CTM) and returns its orientation. Parameters ---------- - ltchar : list - List of PDFMiner LTChar objects. - lttextlh : list List of PDFMiner LTTextLineHorizontal objects. - (optional, default: None) lttextlv : list List of PDFMiner LTTextLineVertical objects. - (optional, default: None) + + ltchar : list + List of PDFMiner LTChar objects. Returns ------- @@ -199,15 +199,9 @@ def get_rotation(ltchar, lttextlh=None, lttextlv=None): anti-clockwise and 'right' if rotated 90 degree clockwise. """ rotation = '' - if lttextlh is not None and lttextlv is not None: - hlen = len([t for t in lttextlh if t.get_text().strip()]) - vlen = len([t for t in lttextlv if t.get_text().strip()]) - vger = 0.0 - else: - hlen = len([t for t in ltchar if t.upright and t.get_text().strip()]) - vlen = len([t for t in ltchar if (not t.upright) and t.get_text().strip()]) - vger = vlen / float(hlen+vlen) - if hlen < vlen or vger > 0.8: + hlen = len([t for t in lttextlh if t.get_text().strip()]) + vlen = len([t for t in lttextlv if t.get_text().strip()]) + if hlen < vlen: clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar) anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar) rotation = 'left' if clockwise < anticlockwise else 'right' @@ -247,7 +241,183 @@ def segments_bbox(bbox, v_segments, h_segments): return v_s, h_s -def text_bbox(bbox, text): +def rotate_segments(v_s, h_s, table_rotation): + """Rotates line segments if the table is rotated. + + Parameters + ---------- + v : list + List of vertical line segments. + + h : list + List of horizontal line segments. + + table_rotation : string + {'', 'left', 'right'} + + + Returns + ------- + vertical : list + List of rotated vertical line segments. + + horizontal : list + List of rotated horizontal line segments. + """ + vertical, horizontal = [], [] + if table_rotation != '': + if table_rotation == 'left': + for v in v_s: + x0, y0 = rotate(0, 0, v[0], v[1], -np.pi / 2) + x1, y1 = rotate(0, 0, v[2], v[3], -np.pi / 2) + horizontal.append((x0, y0, x1, y1)) + for h in h_s: + x0, y0 = rotate(0, 0, h[0], h[1], -np.pi / 2) + x1, y1 = rotate(0, 0, h[2], h[3], -np.pi / 2) + vertical.append((x1, y1, x0, y0)) + elif table_rotation == 'right': + for v in v_s: + x0, y0 = rotate(0, 0, v[0], v[1], np.pi / 2) + x1, y1 = rotate(0, 0, v[2], v[3], np.pi / 2) + horizontal.append((x1, y1, x0, y0)) + for h in h_s: + x0, y0 = rotate(0, 0, h[0], h[1], np.pi / 2) + x1, y1 = rotate(0, 0, h[2], h[3], np.pi / 2) + vertical.append((x0, y0, x1, y1)) + else: + vertical = v_s + horizontal = h_s + return vertical, horizontal + + +def rotate_textlines(lh_bbox, lv_bbox, table_rotation): + """Rotates bounding boxes of LTTextLineHorizontals and + LTTextLineVerticals if the table is rotated. + + Parameters + ---------- + lh_bbox : list + List of PDFMiner LTTextLineHorizontal objects. + + lv_bbox : list + List of PDFMiner LTTextLineVertical objects. + + table_rotation : string + {'', 'left', 'right'} + + Returns + ------- + t_bbox : dict + Dict with two keys 'horizontal' and 'vertical' with lists of + LTTextLineHorizontals and LTTextLineVerticals respectively. + """ + t_bbox = {} + if table_rotation != '': + if table_rotation == 'left': + for t in lh_bbox: + x0, y0, x1, y1 = t.bbox + x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2) + x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2) + t.set_bbox((x1, y0, x0, y1)) + for obj in t._objs: + if isinstance(obj, LTChar): + x0, y0, x1, y1 = obj.bbox + x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2) + x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2) + obj.set_bbox((x1, y0, x0, y1)) + for t in lv_bbox: + x0, y0, x1, y1 = t.bbox + x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2) + x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2) + t.set_bbox((x0, y1, x1, y0)) + for obj in t._objs: + if isinstance(obj, LTChar): + x0, y0, x1, y1 = obj.bbox + x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2) + x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2) + obj.set_bbox((x0, y1, x1, y0)) + elif table_rotation == 'right': + for t in lh_bbox: + x0, y0, x1, y1 = t.bbox + x0, y0 = rotate(0, 0, x0, y0, np.pi / 2) + x1, y1 = rotate(0, 0, x1, y1, np.pi / 2) + t.set_bbox((x0, y1, x1, y0)) + for obj in t._objs: + if isinstance(obj, LTChar): + x0, y0, x1, y1 = obj.bbox + x0, y0 = rotate(0, 0, x0, y0, np.pi / 2) + x1, y1 = rotate(0, 0, x1, y1, np.pi / 2) + obj.set_bbox((x0, y1, x1, y0)) + for t in lv_bbox: + x0, y0, x1, y1 = t.bbox + x0, y0 = rotate(0, 0, x0, y0, np.pi / 2) + x1, y1 = rotate(0, 0, x1, y1, np.pi / 2) + t.set_bbox((x1, y0, x0, y1)) + for obj in t._objs: + if isinstance(obj, LTChar): + x0, y0, x1, y1 = obj.bbox + x0, y0 = rotate(0, 0, x0, y0, np.pi / 2) + x1, y1 = rotate(0, 0, x1, y1, np.pi / 2) + obj.set_bbox((x1, y0, x0, y1)) + t_bbox['horizontal'] = lv_bbox + t_bbox['vertical'] = lh_bbox + else: + t_bbox['horizontal'] = lh_bbox + t_bbox['vertical'] = lv_bbox + return t_bbox + + +def rotate_table(R, C, table_rotation): + """Rotates coordinates of table rows and columns. + + Parameters + ---------- + R : list + List of row x-coordinates. + + C : list + List of column y-coordinates. + + table_rotation : string + {'', 'left', 'right'} + + Returns + ------- + rows : list + List of rotated row x-coordinates. + + cols : list + List of rotated column y-coordinates. + """ + rows, cols = [], [] + if table_rotation != '': + if table_rotation == 'left': + for r in R: + r0, r1 = rotate(0, 0, 0, r[0], -np.pi / 2) + r2, r3 = rotate(0, 0, 0, r[1], -np.pi / 2) + cols.append((r2, r0)) + cols = sorted(cols) + for c in C: + c0, c1 = rotate(0, 0, c[0], 0, -np.pi / 2) + c2, c3 = rotate(0, 0, c[1], 0, -np.pi / 2) + rows.append((c1, c3)) + elif table_rotation == 'right': + for r in R: + r0, r1 = rotate(0, 0, 0, r[0], np.pi / 2) + r2, r3 = rotate(0, 0, 0, r[1], np.pi / 2) + cols.append((r0, r2)) + for c in C: + c0, c1 = rotate(0, 0, c[0], 0, np.pi / 2) + c2, c3 = rotate(0, 0, c[1], 0, np.pi / 2) + rows.append((c3, c1)) + rows = sorted(rows, reverse=True) + else: + rows = R + cols = C + return rows, cols + + +def text_in_bbox(bbox, text): """Returns all text objects present inside a table's bounding box. @@ -330,66 +500,141 @@ def merge_close_values(ar, mtol=2): return ret -def get_row_index(t, rows): - """Gets index of the row in which the given text object lies by - comparing their y-coordinates. +def split_textline(table, textline, direction): + """Splits PDFMiner LTTextLine into substrings if it spans across + multiple rows/columns. Parameters ---------- - t : object + table : object + camelot.pdf.Pdf - rows : list - List of row coordinate tuples, sorted in decreasing order. + textline : object + PDFMiner LTTextLine object. + + direction : string + {'horizontal', 'vertical'} + Direction of the PDFMiner LTTextLine object. Returns ------- - r : int - - error : float + cut_text : list + List of tuples of the form (idx, text) where idx is the index + of row/column and text is the an lttextline substring. """ - offset1, offset2 = 0, 0 - for r in range(len(rows)): - if (t.y0 + t.y1) / 2.0 < rows[r][0] and (t.y0 + t.y1) / 2.0 > rows[r][1]: - if t.y0 > rows[r][0]: - offset1 = abs(t.y0 - rows[r][0]) - if t.y1 < rows[r][1]: - offset2 = abs(t.y1 - rows[r][1]) - X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1) - Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1) - charea = X * Y - error = (X * (offset1 + offset2)) / charea - return r, error + idx = 0 + cut_text = [] + bbox = textline.bbox + if direction == 'horizontal' and not textline.is_empty(): + x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]] + r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]] + r = r_idx[0] + x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right] + if not x_cuts: + x_cuts = [(x_overlap[0], table.cells[r][-1].x2)] + for obj in textline._objs: + row = table.rows[r] + for cut in x_cuts: + if isinstance(obj, LTChar): + if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and + (obj.x0 + obj.x1) / 2 <= cut[1]): + cut_text.append((r, cut[0], obj.get_text().strip('\n'))) + break + elif isinstance(obj, LTAnno): + cut_text.append((r, cut[0], obj.get_text().strip('\n'))) + elif direction == 'vertical' and not textline.is_empty(): + y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]] + c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]] + c = c_idx[0] + y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom] + if not y_cuts: + y_cuts = [(y_overlap[0], table.cells[-1][c].y1)] + for obj in textline._objs: + col = table.cols[c] + for cut in y_cuts: + if isinstance(obj, LTChar): + if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and + (obj.y0 + obj.y1) / 2 >= cut[1]): + cut_text.append((cut[0], c, obj.get_text())) + break + elif isinstance(obj, LTAnno): + cut_text.append((cut[0], c, obj.get_text().strip('\n'))) + return cut_text -def get_column_index(t, columns): - """Gets index of the column in which the given text object lies by - comparing their x-coordinates. +def get_table_index(table, t, direction, split_text=False): + """Gets indices of the cell where given text object lies by + comparing their y and x-coordinates. Parameters ---------- - t : object + table : object + camelot.table.Table - columns : list - List of column coordinate tuples. + t : object + PDFMiner LTTextLine object. + + direction : string + {'horizontal', 'vertical'} + Direction of the PDFMiner LTTextLine object. + + split_text : bool + Whether or not to split a text line if it spans across + multiple cells. + (optional, default: False) Returns ------- - c : int + indices : list + List of tuples of the form (idx, text) where idx is the index + of row/column and text is the an lttextline substring. error : float + Assignment error, percentage of text area that lies outside + a cell. + +-------+ + | | + | [Text bounding box] + | | + +-------+ """ - offset1, offset2 = 0, 0 - for c in range(len(columns)): - if (t.x0 + t.x1) / 2.0 > columns[c][0] and (t.x0 + t.x1) / 2.0 < columns[c][1]: - if t.x0 < columns[c][0]: - offset1 = abs(t.x0 - columns[c][0]) - if t.x1 > columns[c][1]: - offset2 = abs(t.x1 - columns[c][1]) - X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1) - Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1) - charea = X * Y - error = (Y * (offset1 + offset2)) / charea - return c, error + r_idx, c_idx = [-1] * 2 + for r in range(len(table.rows)): + if ((t.y0 + t.y1) / 2.0 < table.rows[r][0] and + (t.y0 + t.y1) / 2.0 > table.rows[r][1]): + lt_col_overlap = [] + for c in table.cols: + if c[0] <= t.x1 and c[1] >= t.x0: + left = t.x0 if c[0] <= t.x0 else c[0] + right = t.x1 if c[1] >= t.x1 else c[1] + lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1])) + else: + lt_col_overlap.append(-1) + if len(filter(lambda x: x != -1, lt_col_overlap)) == 0: + logging.warning("Text doesn't fit any column.") + r_idx = r + c_idx = lt_col_overlap.index(max(lt_col_overlap)) + break + + # error calculation + y0_offset, y1_offset, x0_offset, x1_offset = [0] * 4 + if t.y0 > table.rows[r_idx][0]: + y0_offset = abs(t.y0 - table.rows[r_idx][0]) + if t.y1 < table.rows[r_idx][1]: + y1_offset = abs(t.y1 - table.rows[r_idx][1]) + if t.x0 < table.cols[c_idx][0]: + x0_offset = abs(t.x0 - table.cols[c_idx][0]) + if t.x1 > table.cols[c_idx][1]: + x1_offset = abs(t.x1 - table.cols[c_idx][1]) + X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1) + Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1) + charea = X * Y + error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea + + if split_text: + return split_textline(table, t, direction), error + else: + return [(r_idx, c_idx, t.get_text().strip('\n'))], error def get_score(error_weights): @@ -448,9 +693,14 @@ def count_empty(d): Returns ------- - n_empty_rows : number of empty rows - n_empty_cols : number of empty columns - empty_p : percentage of empty cells + n_empty_rows : list + Number of empty rows. + + n_empty_cols : list + Number of empty columns. + + empty_p : float + Percentage of empty cells. """ empty_p = 0 r_nempty_cells, c_nempty_cells = [], [] @@ -491,7 +741,7 @@ def encode_list(ar): return ar -def get_text_objects(layout, LTType="char", t=None): +def get_text_objects(layout, ltype="char", t=None): """Recursively parses pdf layout to get a list of text objects. @@ -500,7 +750,7 @@ def get_text_objects(layout, LTType="char", t=None): layout : object PDFMiner LTPage object. - LTType : string + ltype : string {'char', 'lh', 'lv'} Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal, and LTTextLineVertical objects respectively. @@ -512,11 +762,11 @@ def get_text_objects(layout, LTType="char", t=None): t : list List of PDFMiner text objects. """ - if LTType == "char": + if ltype == "char": LTObject = LTChar - elif LTType == "lh": + elif ltype == "lh": LTObject = LTTextLineHorizontal - elif LTType == "lv": + elif ltype == "lv": LTObject = LTTextLineVertical if t is None: t = [] @@ -525,7 +775,7 @@ def get_text_objects(layout, LTType="char", t=None): if isinstance(obj, LTObject): t.append(obj) else: - t += get_text_objects(obj, LTType=LTType) + t += get_text_objects(obj, ltype=ltype) except AttributeError: pass return t diff --git a/tests/test_lattice.py b/tests/test_lattice.py index 566217a..818e16a 100644 --- a/tests/test_lattice.py +++ b/tests/test_lattice.py @@ -32,7 +32,7 @@ def test_lattice_basic(): def test_lattice_fill(): data = [ - ["Plan Type","County","Plan Name","Totals"], + ["Plan Type","County","Plan Name","Totals"], ["GMC","Sacramento","Anthem Blue Cross","164,380"], ["GMC","Sacramento","Health Net","126,547"], ["GMC","Sacramento","Kaiser Foundation","74,620"], @@ -122,46 +122,4 @@ def test_lattice_table_rotation(): pdfname = os.path.join(testdir, 'right_rotated_table_1.pdf') manager = Pdf(Lattice(), pdfname, clean=True) tables = manager.extract() - assert_equal(tables['page-1']['table-1']['data'], data) - -def test_lattice_cell_rotation(): - - data = [ - ["Sl.No.","District","Projected Population for 2012-13(In lakhs)","Adult Equivalent to 88%(In lakhs)","Total Consumptionrequirement(@ 400gms/adult/day)(In Lakh tonnes)","Total Requirement(Including seeds, feeds & wastage)(In Lakh tonnes)","Production (Rice)(In Lakh tonnes)","","","Surplus/Defi cit(In Lakh tonnes)",""], - ["","","","","","","Kharif","Rabi","Total","Rice","Paddy"], - ["1","Balasore","23.65","20.81","3.04","3.47","2.78","0.86","3.64","0.17","0.25"], - ["2","Bhadrak","15.34","13.50","1.97","2.25","3.50","0.05","3.55","1.30","1.94"], - ["3","Balangir","17.01","14.97","2.19","2.50","6.23","0.10","6.33","3.83","5.72"], - ["4","Subarnapur","6.70","5.90","0.86","0.98","4.48","1.13","5.61","4.63","6.91"], - ["5","Cuttack","26.63","23.43","3.42","3.91","3.75","0.06","3.81","-0.10","-0.15"], - ["6","Jagatsingpur","11.49","10.11","1.48","1.69","2.10","0.02","2.12","0.43","0.64"], - ["7","Jajpur","18.59","16.36","2.39","2.73","2.13","0.04","2.17","-0.56","-0.84"], - ["8","Kendrapara","14.62","12.87","1.88","2.15","2.60","0.07","2.67","0.52","0.78"], - ["9","Dhenkanal","12.13","10.67","1.56","1.78","2.26","0.02","2.28","0.50","0.75"], - ["10","Angul","12.93","11.38","1.66","1.90","1.73","0.02","1.75","-0.15","-0.22"], - ["11","Ganjam","35.77","31.48","4.60","5.26","4.57","0.00","4.57","-0.69","-1.03"], - ["12","Gajapati","5.85","5.15","0.75","0.86","0.68","0.01","0.69","-0.17","-0.25"], - ["13","Kalahandi","16.12","14.19","2.07","2.37","5.42","1.13","6.55","4.18","6.24"], - ["14","Nuapada","6.18","5.44","0.79","0.90","1.98","0.08","2.06","1.16","1.73"], - ["15","Keonjhar","18.42","16.21","2.37","2.71","2.76","0.08","2.84","0.13","0.19"], - ["16","Koraput","14.09","12.40","1.81","2.07","2.08","0.34","2.42","0.35","0.52"], - ["17","Malkangiri","6.31","5.55","0.81","0.93","1.78","0.04","1.82","0.89","1.33"], - ["18","Nabarangpur","12.50","11.00","1.61","1.84","3.26","0.02","3.28","1.44","2.15"], - ["19","Rayagada","9.83","8.65","1.26","1.44","1.15","0.03","1.18","-0.26","-0.39"], - ["20","Mayurbhanj","25.61","22.54","3.29","3.76","4.90","0.06","4.96","1.20","1.79"], - ["21","Kandhamal","7.45","6.56","0.96","1.10","0.70","0.01","0.71","-0.39","-0.58"], - ["22","Boudh","4.51","3.97","0.58","0.66","1.73","0.03","1.76","1.10","1.64"], - ["23","Puri","17.29","15.22","2.22","2.54","2.45","0.99","3.44","0.90","1.34"], - ["24","Khordha","23.08","20.31","2.97","3.39","2.02","0.03","2.05","-1.34","-2.00"], - ["25","Nayagarh","9.78","8.61","1.26","1.44","2.10","0.00","2.10","0.66","0.99"], - ["26","Sambalpur","10.62","9.35","1.37","1.57","3.45","0.71","4.16","2.59","3.87"], - ["27","Bargarh","15.00","13.20","1.93","2.21","6.87","2.65","9.52","7.31","10.91"], - ["28","Deogarh","3.18","2.80","0.41","0.47","1.12","0.07","1.19","0.72","1.07"], - ["29","Jharsuguda","5.91","5.20","0.76","0.87","0.99","0.01","1.00","0.13","0.19"], - ["30","Sundargarh","21.21","18.66","2.72","3.11","4.72","0.02","4.74","1.63","2.43"], - ["ODISHA","","427.80","376.49","54.99","62.86","86.29","8.68","94.97","32.11","47.92"] - ] - pdfname = os.path.join(testdir, 'agstat.pdf') - manager = Pdf(Lattice(), pdfname, clean=True) - tables = manager.extract() assert_equal(tables['page-1']['table-1']['data'], data) \ No newline at end of file diff --git a/tests/test_stream.py b/tests/test_stream.py index 3535950..a23ad08 100644 --- a/tests/test_stream.py +++ b/tests/test_stream.py @@ -169,45 +169,45 @@ def test_stream_columns(): def test_stream_table_rotation(): data = [ - ["Table 21 Current use of contraception by background characteristics—Continued","","","","","","","","","","","","","","","",""], - ["","","","","","Modern method","","","","","","","Traditional method","","","",""], - ["","","Any","","","","","","","Other","Any","","","","Not","","Number"], - ["","Any","modern","Female","Male","","","","Condom/","modern","traditional","","With-","Folk","currently","","of"], - ["Background characteristic","method","method","sterilization","sterilization","Pill","IUD","Injectables","Nirodh","method","method","Rhythm","drawal","method","using","Total","women"], - ["Caste/tribe","","","","","","","","","","","","","","","",""], - ["Scheduled caste","74.8","55.8","42.9","0.9","9.7","0.0","0.2","2.2","0.0","19.0","11.2","7.4","0.4","25.2","100.0","1,363"], - ["Scheduled tribe","59.3","39.0","26.8","0.6","6.4","0.6","1.2","3.5","0.0","20.3","10.4","5.8","4.1","40.7","100.0","256"], - ["Other backward class","71.4","51.1","34.9","0.0","8.6","1.4","0.0","6.2","0.0","20.4","12.6","7.8","0.0","28.6","100.0","211"], - ["Other","71.1","48.8","28.2","0.8","13.3","0.9","0.3","5.2","0.1","22.3","12.9","9.1","0.3","28.9","100.0","3,319"], - ["Wealth index","","","","","","","","","","","","","","","",""], - ["Lowest","64.5","48.6","34.3","0.5","10.5","0.6","0.7","2.0","0.0","15.9","9.9","4.6","1.4","35.5","100.0","1,258"], - ["Second","68.5","50.4","36.2","1.1","11.4","0.5","0.1","1.1","0.0","18.1","11.2","6.7","0.2","31.5","100.0","1,317"], - ["Middle","75.5","52.8","33.6","0.6","14.2","0.4","0.5","3.4","0.1","22.7","13.4","8.9","0.4","24.5","100.0","1,018"], - ["Fourth","73.9","52.3","32.0","0.5","12.5","0.6","0.2","6.3","0.2","21.6","11.5","9.9","0.2","26.1","100.0","908"], - ["Highest","78.3","44.4","19.5","1.0","9.7","1.4","0.0","12.7","0.0","33.8","18.2","15.6","0.0","21.7","100.0","733"], - ["Number of living children","","","","","","","","","","","","","","","",""], - ["No children","25.1","7.6","0.3","0.5","2.0","0.0","0.0","4.8","0.0","17.5","9.0","8.5","0.0","74.9","100.0","563"], - ["1 child","66.5","32.1","3.7","0.7","20.1","0.7","0.1","6.9","0.0","34.3","18.9","15.2","0.3","33.5","100.0","1,190"], - ["1 son","66.8","33.2","4.1","0.7","21.1","0.5","0.3","6.6","0.0","33.5","21.2","12.3","0.0","33.2","100.0","672"], - ["No sons","66.1","30.7","3.1","0.6","18.8","0.8","0.0","7.3","0.0","35.4","15.8","19.0","0.6","33.9","100.0","517"], - ["2 children","81.6","60.5","41.8","0.9","11.6","0.8","0.3","4.8","0.2","21.1","12.2","8.3","0.6","18.4","100.0","1,576"], - ["1 or more sons","83.7","64.2","46.4","0.9","10.8","0.8","0.4","4.8","0.1","19.5","11.1","7.6","0.7","16.3","100.0","1,268"], - ["No sons","73.2","45.5","23.2","1.0","15.1","0.9","0.0","4.8","0.5","27.7","16.8","11.0","0.0","26.8","100.0","308"], - ["3 children","83.9","71.2","57.7","0.8","9.8","0.6","0.5","1.8","0.0","12.7","8.7","3.3","0.8","16.1","100.0","961"], - ["1 or more sons","85.0","73.2","60.3","0.9","9.4","0.5","0.5","1.6","0.0","11.8","8.1","3.0","0.7","15.0","100.0","860"], - ["No sons","74.7","53.8","35.3","0.0","13.7","1.6","0.0","3.2","0.0","20.9","13.4","6.1","1.5","25.3","100.0","101"], - ["4+ children","74.3","58.1","45.1","0.6","8.7","0.6","0.7","2.4","0.0","16.1","9.9","5.4","0.8","25.7","100.0","944"], - ["1 or more sons","73.9","58.2","46.0","0.7","8.3","0.7","0.7","1.9","0.0","15.7","9.4","5.5","0.8","26.1","100.0","901"], - ["No sons","(82.1)","(57.3)","(25.6)","(0.0)","(17.8)","(0.0)","(0.0)","(13.9)","(0.0)","(24.8)","(21.3)","(3.5)","(0.0)","(17.9)","100.0","43"], - ["Total","71.2","49.9","32.2","0.7","11.7","0.6","0.3","4.3","0.1","21.3","12.3","8.4","0.5","28.8","100.0","5,234"], - ["NFHS-2 (1998-99)","66.6","47.3","32.0","1.8","9.2","1.4","na","2.9","na","na","8.7","9.8","na","33.4","100.0","4,116"], - ["NFHS-1 (1992-93)","57.7","37.6","26.5","4.3","3.6","1.3","0.1","1.9","na","na","11.3","8.3","na","42.3","100.0","3,970"], - ["","Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are","","","","","","","","","","","","","","",""], - ["not shown separately.","","","","","","","","","","","","","","","",""], - ["na = Not available","","","","","","","","","","","","","","","",""], - ["","ns = Not shown; see table 2b, footnote 1","","","","","","","","","","","","","","",""], - ["( ) Based on 25-49 unweighted cases.","","","","","","","","","","","","","","","",""], - ["","","","","","","","54","","","","","","","","",""] + ["","","Table 21 Current use of contraception by background characteristics—Continued","","","","","","","","","","","","","","",""], + ["","","","","","","Modern method","","","","","","","Traditional method","","","",""], + ["","","","Any","","","","","","","Other","Any","","","","Not","","Number"], + ["","","Any","modern","Female","Male","","","","Condom/","modern","traditional","","With-","Folk","currently","","of"], + ["","Background characteristic","method","method","sterilization","sterilization","Pill","IUD","Injectables","Nirodh","method","method","Rhythm","drawal","method","using","Total","women"], + ["","Caste/tribe","","","","","","","","","","","","","","","",""], + ["","Scheduled caste","74.8","55.8","42.9","0.9","9.7","0.0","0.2","2.2","0.0","19.0","11.2","7.4","0.4","25.2","100.0","1,363"], + ["","Scheduled tribe","59.3","39.0","26.8","0.6","6.4","0.6","1.2","3.5","0.0","20.3","10.4","5.8","4.1","40.7","100.0","256"], + ["","Other backward class","71.4","51.1","34.9","0.0","8.6","1.4","0.0","6.2","0.0","20.4","12.6","7.8","0.0","28.6","100.0","211"], + ["","Other","71.1","48.8","28.2","0.8","13.3","0.9","0.3","5.2","0.1","22.3","12.9","9.1","0.3","28.9","100.0","3,319"], + ["","Wealth index","","","","","","","","","","","","","","","",""], + ["","Lowest","64.5","48.6","34.3","0.5","10.5","0.6","0.7","2.0","0.0","15.9","9.9","4.6","1.4","35.5","100.0","1,258"], + ["","Second","68.5","50.4","36.2","1.1","11.4","0.5","0.1","1.1","0.0","18.1","11.2","6.7","0.2","31.5","100.0","1,317"], + ["","Middle","75.5","52.8","33.6","0.6","14.2","0.4","0.5","3.4","0.1","22.7","13.4","8.9","0.4","24.5","100.0","1,018"], + ["","Fourth","73.9","52.3","32.0","0.5","12.5","0.6","0.2","6.3","0.2","21.6","11.5","9.9","0.2","26.1","100.0","908"], + ["","Highest","78.3","44.4","19.5","1.0","9.7","1.4","0.0","12.7","0.0","33.8","18.2","15.6","0.0","21.7","100.0","733"], + ["","Number of living children","","","","","","","","","","","","","","","",""], + ["","No children","25.1","7.6","0.3","0.5","2.0","0.0","0.0","4.8","0.0","17.5","9.0","8.5","0.0","74.9","100.0","563"], + ["","1 child","66.5","32.1","3.7","0.7","20.1","0.7","0.1","6.9","0.0","34.3","18.9","15.2","0.3","33.5","100.0","1,190"], + ["","1 son","66.8","33.2","4.1","0.7","21.1","0.5","0.3","6.6","0.0","33.5","21.2","12.3","0.0","33.2","100.0","672"], + ["","No sons","66.1","30.7","3.1","0.6","18.8","0.8","0.0","7.3","0.0","35.4","15.8","19.0","0.6","33.9","100.0","517"], + ["","2 children","81.6","60.5","41.8","0.9","11.6","0.8","0.3","4.8","0.2","21.1","12.2","8.3","0.6","18.4","100.0","1,576"], + ["","1 or more sons","83.7","64.2","46.4","0.9","10.8","0.8","0.4","4.8","0.1","19.5","11.1","7.6","0.7","16.3","100.0","1,268"], + ["","No sons","73.2","45.5","23.2","1.0","15.1","0.9","0.0","4.8","0.5","27.7","16.8","11.0","0.0","26.8","100.0","308"], + ["","3 children","83.9","71.2","57.7","0.8","9.8","0.6","0.5","1.8","0.0","12.7","8.7","3.3","0.8","16.1","100.0","961"], + ["","1 or more sons","85.0","73.2","60.3","0.9","9.4","0.5","0.5","1.6","0.0","11.8","8.1","3.0","0.7","15.0","100.0","860"], + ["","No sons","74.7","53.8","35.3","0.0","13.7","1.6","0.0","3.2","0.0","20.9","13.4","6.1","1.5","25.3","100.0","101"], + ["","4+ children","74.3","58.1","45.1","0.6","8.7","0.6","0.7","2.4","0.0","16.1","9.9","5.4","0.8","25.7","100.0","944"], + ["","1 or more sons","73.9","58.2","46.0","0.7","8.3","0.7","0.7","1.9","0.0","15.7","9.4","5.5","0.8","26.1","100.0","901"], + ["","No sons","(82.1)","(57.3)","(25.6)","(0.0)","(17.8)","(0.0)","(0.0)","(13.9)","(0.0)","(24.8)","(21.3)","(3.5)","(0.0)","(17.9)","100.0","43"], + ["","Total","71.2","49.9","32.2","0.7","11.7","0.6","0.3","4.3","0.1","21.3","12.3","8.4","0.5","28.8","100.0","5,234"], + ["","NFHS-2 (1998-99)","66.6","47.3","32.0","1.8","9.2","1.4","na","2.9","na","na","8.7","9.8","na","33.4","100.0","4,116"], + ["","NFHS-1 (1992-93)","57.7","37.6","26.5","4.3","3.6","1.3","0.1","1.9","na","na","11.3","8.3","na","42.3","100.0","3,970"], + ["","","Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are","","","","","","","","","","","","","","",""], + ["","not shown separately.","","","","","","","","","","","","","","","",""], + ["","na = Not available","","","","","","","","","","","","","","","",""], + ["","","ns = Not shown; see table 2b, footnote 1","","","","","","","","","","","","","","",""], + ["","( ) Based on 25-49 unweighted cases.","","","","","","","","","","","","","","","",""], + ["","","","","","","","","54","","","","","","","","",""] ] pdfname = os.path.join(testdir, "left_rotated_table_2.pdf") manager = Pdf(Stream(), pdfname, clean=True)