Replace chars with textlines

* Add split function

* Add split_text and shift_text params

* Change get_rotation

* Move get_column_index to utils

* Add split_text and shift_text

* Fix split_text
pull/2/head
Vinayak Mehta 2016-10-12 13:17:02 +05:30 committed by GitHub
parent 02ef332bd6
commit a43d5ca2c7
7 changed files with 590 additions and 428 deletions

View File

@ -8,10 +8,10 @@ import subprocess
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
find_table_joints)
from .table import Table
from .utils import (scale_to_pdf, scale_to_image, get_rotation, segments_bbox,
text_bbox, merge_close_values, get_row_index,
get_column_index, get_score, count_empty, encode_list,
get_text_objects, get_page_layout)
from .utils import (scale_to_pdf, scale_to_image, get_rotation, rotate_segments,
rotate_textlines, rotate_table, segments_bbox, text_in_bbox,
merge_close_values, get_table_index, get_score, count_empty,
encode_list, get_text_objects, get_page_layout)
__all__ = ['Lattice']
@ -25,6 +25,52 @@ def _reduce_method(m):
copy_reg.pickle(types.MethodType, _reduce_method)
def _reduce_index(t, idx, shift_text):
    """Moves text indices to the anchoring cell of a spanning cell.

    For every (r_idx, c_idx, text) entry, each direction in shift_text
    is applied in the given order: if the current cell is flagged as
    spanning along that axis, the index is walked towards that side
    until a cell whose corresponding edge is set is found.

    Parameters
    ----------
    t : object
        camelot.table.Table

    idx : list
        List of tuples of the form (r_idx, c_idx, text).

    shift_text : list
        {'l', 'r', 't', 'b'}
        Select one or more from above and pass them as a list to
        specify where the text in a spanning cell should flow.

    Returns
    -------
    indices : list
        List of tuples of the form (r_idx, c_idx, text) where r_idx and
        c_idx are the reduced row and column indices and text is the
        text passed in for that entry.
    """
    indices = []
    for r_idx, c_idx, text in idx:
        for d in shift_text:
            if d == 'l':
                if t.cells[r_idx][c_idx].spanning_h:
                    n_cols = len(t.cells[r_idx])
                    # Stop at the first column (0 or -n_cols) so a missing
                    # left edge cannot wrap around to the end of the row.
                    while (c_idx % n_cols != 0 and
                           not t.cells[r_idx][c_idx].left):
                        c_idx -= 1
            elif d == 'r':
                if t.cells[r_idx][c_idx].spanning_h:
                    n_cols = len(t.cells[r_idx])
                    # Stop at the last column instead of running past it.
                    while (c_idx % n_cols != n_cols - 1 and
                           not t.cells[r_idx][c_idx].right):
                        c_idx += 1
            elif d == 't':
                if t.cells[r_idx][c_idx].spanning_v:
                    n_rows = len(t.cells)
                    # Stop at the first row so a missing top edge cannot
                    # wrap around to the bottom of the table.
                    while (r_idx % n_rows != 0 and
                           not t.cells[r_idx][c_idx].top):
                        r_idx -= 1
            elif d == 'b':
                if t.cells[r_idx][c_idx].spanning_v:
                    n_rows = len(t.cells)
                    # Stop at the last row instead of running past it.
                    while (r_idx % n_rows != n_rows - 1 and
                           not t.cells[r_idx][c_idx].bottom):
                        r_idx += 1
        indices.append((r_idx, c_idx, text))
    return indices
def _fill_spanning(t, fill=None):
"""Fills spanning cells.
@ -67,78 +113,6 @@ def _fill_spanning(t, fill=None):
return t
def _outline(t):
    """Marks the outer border of a table as drawn.

    The left edge of the first cell and the right edge of the last cell
    of every row are set to True, as are the top edges of the first row
    and the bottom edges of the last row.

    Parameters
    ----------
    t : object
        camelot.table.Table

    Returns
    -------
    t : object
        camelot.table.Table
    """
    grid = t.cells
    for row in grid:
        row[0].left = True
        row[len(row) - 1].right = True
    first_row = grid[0]
    last_row = grid[len(grid) - 1]
    # The width of the first row drives the top/bottom pass.
    for k in range(len(first_row)):
        first_row[k].top = True
        last_row[k].bottom = True
    return t
def _reduce_index(t, rotation, r_idx, c_idx):
    """Walks a cell index to the anchoring cell of a spanning cell.

    The side towards which the index is moved depends on the table
    rotation: left/top for an unrotated table, left/bottom for 'left'
    and right/top for 'right'. Any other rotation value leaves the
    index untouched.

    Parameters
    ----------
    t : object
        camelot.table.Table

    rotation : string
        {'', 'left', 'right'}

    r_idx : int
        Current row index.

    c_idx : int
        Current column index.

    Returns
    -------
    r_idx : int
        Reduced row index.

    c_idx : int
        Reduced column index.
    """
    # Pick which edge ends the walk and the step direction per axis.
    if not rotation:
        h_edge, h_step, v_edge, v_step = 'left', -1, 'top', -1
    elif rotation == 'left':
        h_edge, h_step, v_edge, v_step = 'left', -1, 'bottom', 1
    elif rotation == 'right':
        h_edge, h_step, v_edge, v_step = 'right', 1, 'top', -1
    else:
        return r_idx, c_idx

    if t.cells[r_idx][c_idx].spanning_h:
        while not getattr(t.cells[r_idx][c_idx], h_edge):
            c_idx += h_step
    if t.cells[r_idx][c_idx].spanning_v:
        while not getattr(t.cells[r_idx][c_idx], v_edge):
            r_idx += v_step
    return r_idx, c_idx
class Lattice:
"""Lattice looks for lines in the pdf to form a table.
@ -179,6 +153,17 @@ class Lattice:
PDFMiner margins. (char_margin, line_margin, word_margin)
(optional, default: (1.0, 0.5, 0.1))
split_text : bool
Whether or not to split a text line if it spans across
different cells.
(optional, default: False)
shift_text : list
{'l', 'r', 't', 'b'}
Select one or more from above and pass them as a list to
specify where the text in a spanning cell should flow.
(optional, default: ['l', 't'])
debug : string
{'contour', 'line', 'joint', 'table'}
Set to one of the above values to generate a matplotlib plot
@ -186,7 +171,8 @@ class Lattice:
(optional, default: None)
"""
def __init__(self, table_area=None, fill=None, mtol=[2], scale=15,
invert=False, margins=(1.0, 0.5, 0.1), debug=None):
invert=False, margins=(1.0, 0.5, 0.1), split_text=False,
shift_text=['l', 't'], debug=None):
self.method = 'lattice'
self.table_area = table_area
@ -195,6 +181,8 @@ class Lattice:
self.scale = scale
self.invert = invert
self.char_margin, self.line_margin, self.word_margin = margins
self.split_text = split_text
self.shift_text = shift_text
self.debug = debug
def get_tables(self, pdfname):
@ -211,9 +199,9 @@ class Lattice:
"""
layout, dim = get_page_layout(pdfname, char_margin=self.char_margin,
line_margin=self.line_margin, word_margin=self.word_margin)
ltchar = get_text_objects(layout, LTType="char")
lttextlh = get_text_objects(layout, LTType="lh")
lttextlv = get_text_objects(layout, LTType="lv")
lttextlh = get_text_objects(layout, ltype="lh")
lttextlv = get_text_objects(layout, ltype="lv")
ltchar = get_text_objects(layout, ltype="char")
width, height = dim
bname, __ = os.path.splitext(pdfname)
if not ltchar:
@ -287,11 +275,15 @@ class Lattice:
# select elements which lie within table_bbox
table_data = {}
v_s, h_s = segments_bbox(k, v_segments, h_segments)
char_bbox = text_bbox(k, ltchar)
lh_bbox = text_bbox(k, lttextlh)
lv_bbox = text_bbox(k, lttextlv)
lh_bbox = text_in_bbox(k, lttextlh)
lv_bbox = text_in_bbox(k, lttextlv)
char_bbox = text_in_bbox(k, ltchar)
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
table_rotation = get_rotation(char_bbox, lh_bbox, lv_bbox)
table_rotation = get_rotation(lh_bbox, lv_bbox, char_bbox)
v_s, h_s = rotate_segments(v_s, h_s, table_rotation)
t_bbox = rotate_textlines(lh_bbox, lv_bbox, table_rotation)
for direction in t_bbox:
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
cols, rows = zip(*table_bbox[k])
cols, rows = list(cols), list(rows)
cols.extend([k[0], k[2]])
@ -305,6 +297,7 @@ class Lattice:
for i in range(0, len(cols) - 1)]
rows = [(rows[i], rows[i + 1])
for i in range(0, len(rows) - 1)]
rows, cols = rotate_table(rows, cols, table_rotation)
table = Table(cols, rows)
# set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s)
@ -313,58 +306,26 @@ class Lattice:
# set spanning cells to True
table = table.set_spanning()
# set table border edges to True
table = _outline(table)
table = table.set_border_edges()
if self.debug:
self.debug_tables.append(table)
rerror = []
cerror = []
for t in char_bbox:
try:
r_idx, rass_error = get_row_index(t, rows)
except TypeError:
# couldn't assign LTChar to any cell
continue
try:
c_idx, cass_error = get_column_index(t, cols)
except TypeError:
# couldn't assign LTChar to any cell
continue
rerror.append(rass_error)
cerror.append(cass_error)
r_idx, c_idx = _reduce_index(table, table_rotation, r_idx, c_idx)
table.cells[r_idx][c_idx].add_object(t)
for i in range(len(table.cells)):
for j in range(len(table.cells[i])):
t_bbox = table.cells[i][j].get_objects()
try:
cell_rotation = get_rotation(t_bbox)
except ZeroDivisionError:
cell_rotation = ''
pass
# fill text after sorting it
if cell_rotation == '':
t_bbox.sort(key=lambda x: (-x.y0, x.x0))
elif cell_rotation == 'left':
t_bbox.sort(key=lambda x: (x.x0, x.y0))
elif cell_rotation == 'right':
t_bbox.sort(key=lambda x: (-x.x0, -x.y0))
table.cells[i][j].add_text(''.join([t.get_text()
for t in t_bbox]))
score = get_score([[50, rerror], [50, cerror]])
assignment_errors = []
for direction in t_bbox:
for t in t_bbox[direction]:
indices, error = get_table_index(
table, t, direction, split_text=self.split_text)
assignment_errors.append(error)
indices = _reduce_index(table, indices, shift_text=self.shift_text)
for r_idx, c_idx, text in indices:
table.cells[r_idx][c_idx].add_text(text)
score = get_score([[100, assignment_errors]])
table_data['score'] = score
if self.fill is not None:
table = _fill_spanning(table, fill=self.fill[table_no])
ar = table.get_list()
if table_rotation == 'left':
ar = zip(*ar[::-1])
elif table_rotation == 'right':
ar = zip(*ar[::1])
ar.reverse()
ar = encode_list(ar)
table_data['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)

View File

@ -196,28 +196,28 @@ class Pdf:
try:
for tables in self.debug_tables:
for table in tables:
for i in range(len(table.cells)):
for j in range(len(table.cells[i])):
if table.cells[i][j].left:
plt.plot([table.cells[i][j].lb[0],
table.cells[i][j].lt[0]],
[table.cells[i][j].lb[1],
table.cells[i][j].lt[1]])
if table.cells[i][j].right:
plt.plot([table.cells[i][j].rb[0],
table.cells[i][j].rt[0]],
[table.cells[i][j].rb[1],
table.cells[i][j].rt[1]])
if table.cells[i][j].top:
plt.plot([table.cells[i][j].lt[0],
table.cells[i][j].rt[0]],
[table.cells[i][j].lt[1],
table.cells[i][j].rt[1]])
if table.cells[i][j].bottom:
plt.plot([table.cells[i][j].lb[0],
table.cells[i][j].rb[0]],
[table.cells[i][j].lb[1],
table.cells[i][j].rb[1]])
for r in range(len(table.rows)):
for c in range(len(table.cols)):
if table.cells[r][c].left:
plt.plot([table.cells[r][c].lb[0],
table.cells[r][c].lt[0]],
[table.cells[r][c].lb[1],
table.cells[r][c].lt[1]])
if table.cells[r][c].right:
plt.plot([table.cells[r][c].rb[0],
table.cells[r][c].rt[0]],
[table.cells[r][c].rb[1],
table.cells[r][c].rt[1]])
if table.cells[r][c].top:
plt.plot([table.cells[r][c].lt[0],
table.cells[r][c].rt[0]],
[table.cells[r][c].lt[1],
table.cells[r][c].rt[1]])
if table.cells[r][c].bottom:
plt.plot([table.cells[r][c].lb[0],
table.cells[r][c].rb[0]],
[table.cells[r][c].lb[1],
table.cells[r][c].rb[1]])
plt.show()
except AttributeError:
raise ValueError("This option only be used with Lattice.")

View File

@ -7,8 +7,9 @@ import copy_reg
import numpy as np
from .table import Table
from .utils import (rotate, get_rotation, text_bbox, get_row_index, get_score,
count_empty, encode_list, get_text_objects, get_page_layout)
from .utils import (rotate, get_rotation, rotate_textlines, text_in_bbox,
get_table_index, get_score, count_empty, encode_list,
get_text_objects, get_page_layout)
__all__ = ['Stream']
@ -22,6 +23,29 @@ def _reduce_method(m):
copy_reg.pickle(types.MethodType, _reduce_method)
def _text_bbox(t_bbox):
    """Computes the bounding box enclosing every text line in t_bbox.

    Parameters
    ----------
    t_bbox : dict
        Dict with two keys 'horizontal' and 'vertical' with lists of
        LTTextLineHorizontals and LTTextLineVerticals respectively.

    Returns
    -------
    text_bbox : tuple
        Tuple of the form (x0, y0, x1, y1) in PDFMiner's coordinate
        space.
    """
    # Flatten all directions once; the extremes are taken over this list.
    textlines = [t for direction in t_bbox for t in t_bbox[direction]]
    left = min([t.x0 for t in textlines])
    bottom = min([t.y0 for t in textlines])
    right = max([t.x1 for t in textlines])
    top = max([t.y1 for t in textlines])
    return (left, bottom, right, top)
def _group_rows(text, ytol=2):
"""Groups PDFMiner text objects into rows using their
y-coordinates taking into account some tolerance ytol.
@ -185,45 +209,6 @@ def _add_columns(cols, text, ytol):
return cols
def _get_column_index(t, columns):
    """Gets index of the column in which the given text object lies by
    comparing their x-coordinates.

    The column whose width is covered the most by the text object is
    chosen. If the text object overlaps no column at all, a warning is
    logged and the first column is returned.

    Parameters
    ----------
    t : object
        Text object exposing x0, x1, y0 and y1 attributes.

    columns : list
        List of column coordinate tuples.

    Returns
    -------
    c_idx : int
        Index of the selected column.

    error : float
        Part of the text object's width lying outside the selected
        column, as a fraction of the text object's width.

    Raises
    ------
    ValueError
        If columns is empty.
    """
    offset1, offset2 = 0, 0
    lt_col_overlap = []
    for c in columns:
        if c[0] <= t.x1 and c[1] >= t.x0:
            left = t.x0 if c[0] <= t.x0 else c[0]
            right = t.x1 if c[1] >= t.x1 else c[1]
            # float() keeps this a true division even without
            # `from __future__ import division` on Python 2.
            lt_col_overlap.append(abs(left - right) / float(abs(c[0] - c[1])))
        else:
            lt_col_overlap.append(-1)
    # all() works on both Python 2 and 3, unlike len(filter(...)).
    if all(overlap == -1 for overlap in lt_col_overlap):
        logging.warning("Text doesn't fit any column.")
    c_idx = lt_col_overlap.index(max(lt_col_overlap))
    if t.x0 < columns[c_idx][0]:
        offset1 = abs(t.x0 - columns[c_idx][0])
    if t.x1 > columns[c_idx][1]:
        offset2 = abs(t.x1 - columns[c_idx][1])
    # Fall back to 1.0 for degenerate boxes so the area is never zero.
    X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
    Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
    charea = X * Y
    error = (Y * (offset1 + offset2)) / float(charea)
    return c_idx, error
class Stream:
"""Stream looks for spaces between text elements to form a table.
@ -265,13 +250,19 @@ class Stream:
PDFMiner margins. (char_margin, line_margin, word_margin)
(optional, default: (1.0, 0.5, 0.1))
split_text : bool
Whether or not to split a text line if it spans across
different cells.
(optional, default: False)
debug : bool
Set to True to generate a matplotlib plot of
LTTextLineHorizontals in order to select table_area, columns.
(optional, default: False)
"""
def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2],
mtol=[0], margins=(1.0, 0.5, 0.1), debug=False):
mtol=[0], margins=(1.0, 0.5, 0.1), split_text=False,
debug=False):
self.method = 'stream'
self.table_area = table_area
@ -280,6 +271,7 @@ class Stream:
self.ytol = ytol
self.mtol = mtol
self.char_margin, self.line_margin, self.word_margin = margins
self.split_text = split_text
self.debug = debug
def get_tables(self, pdfname):
@ -296,9 +288,9 @@ class Stream:
"""
layout, dim = get_page_layout(pdfname, char_margin=self.char_margin,
line_margin=self.line_margin, word_margin=self.word_margin)
ltchar = get_text_objects(layout, LTType="char")
lttextlh = get_text_objects(layout, LTType="lh")
lttextlv = get_text_objects(layout, LTType="lv")
lttextlh = get_text_objects(layout, ltype="lh")
lttextlv = get_text_objects(layout, ltype="lv")
ltchar = get_text_objects(layout, ltype="char")
width, height = dim
bname, __ = os.path.splitext(pdfname)
if not lttextlh:
@ -308,6 +300,8 @@ class Stream:
if self.debug:
self.debug_text = []
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
if self.table_area is not None:
if self.columns is not None:
@ -339,34 +333,16 @@ class Stream:
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
# select elements which lie within table_bbox
table_data = {}
table_rotation = get_rotation(ltchar, lttextlh, lttextlv)
if table_rotation != '':
t_bbox = text_bbox(k, lttextlv)
if table_rotation == 'left':
if self.debug:
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
for t in t_bbox:
x0, y0, x1, y1 = t.bbox
x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2)
x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2)
t.set_bbox((x0, y1, x1, y0))
elif table_rotation == 'right':
for t in t_bbox:
x0, y0, x1, y1 = t.bbox
x0, y0 = rotate(0, 0, x0, y0, np.pi / 2)
x1, y1 = rotate(0, 0, x1, y1, np.pi / 2)
t.set_bbox((x1, y0, x0, y1))
else:
if self.debug:
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
t_bbox = text_bbox(k, lttextlh)
t_bbox.sort(key=lambda x: (-x.y0, x.x0))
text_x_min = min([t.x0 for t in t_bbox])
text_y_min = min([t.y0 for t in t_bbox])
text_x_max = max([t.x1 for t in t_bbox])
text_y_max = max([t.y1 for t in t_bbox])
rows_grouped = _group_rows(t_bbox, ytol=self.ytol[table_no])
lh_bbox = text_in_bbox(k, lttextlh)
lv_bbox = text_in_bbox(k, lttextlv)
char_bbox = text_in_bbox(k, ltchar)
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
table_rotation = get_rotation(lh_bbox, lv_bbox, char_bbox)
t_bbox = rotate_textlines(lh_bbox, lv_bbox, table_rotation)
for direction in t_bbox:
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox)
rows_grouped = _group_rows(t_bbox['horizontal'], ytol=self.ytol[table_no])
rows = _join_rows(rows_grouped, text_y_max, text_y_min)
elements = [len(r) for r in rows_grouped]
@ -402,9 +378,9 @@ class Stream:
len_non_mode = len(filter(lambda x: x != ncols, elements))
if ncols == 1 and not self.debug:
# no tables detected
logging.warning("{}: Only one column was detected, the PDF"
logging.warning("{}: Only one column was detected, the pdf"
" may have no tables. Specify ncols if"
" the PDF has tables.".format(
" the pdf has tables.".format(
os.path.basename(bname)))
cols = [(t.x0, t.x1)
for r in rows_grouped if len(r) == ncols for t in r]
@ -413,35 +389,30 @@ class Stream:
for i in range(1, len(cols)):
left = cols[i - 1][1]
right = cols[i][0]
inner_text.extend([t for t in t_bbox if t.x0 > left and t.x1 < right])
outer_text = [t for t in t_bbox if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
inner_text.extend([t for direction in t_bbox
for t in t_bbox[direction]
if t.x0 > left and t.x1 < right])
outer_text = [t for direction in t_bbox
for t in t_bbox[direction]
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
inner_text.extend(outer_text)
cols = _add_columns(cols, inner_text, self.ytol[table_no])
cols = _join_columns(cols, text_x_min, text_x_max)
table = Table(cols, rows)
rerror = []
cerror = []
for row in rows_grouped:
for t in row:
try:
r_idx, rass_error = get_row_index(t, rows)
except ValueError as e:
# couldn't assign LTTextLH to any cell
continue
try:
c_idx, cass_error = _get_column_index(t, cols)
except ValueError as e:
# couldn't assign LTTextLH to any cell
continue
rerror.append(rass_error)
cerror.append(cass_error)
table.cells[r_idx][c_idx].add_text(
t.get_text().strip('\n'))
table = table.set_all_edges()
assignment_errors = []
for direction in t_bbox:
for t in t_bbox[direction]:
indices, error = get_table_index(
table, t, direction, split_text=self.split_text)
assignment_errors.append(error)
for r_idx, c_idx, text in indices:
table.cells[r_idx][c_idx].add_text(text)
if guess:
score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
score = get_score([[66, assignment_errors], [34, [len_non_mode / len(elements)]]])
else:
score = get_score([[50, rerror], [50, cerror]])
score = get_score([[100, assignment_errors]])
table_data['score'] = score
ar = table.get_list()

View File

@ -35,6 +35,28 @@ class Table:
for c in cols] for r in rows]
self.nocont_ = 0
def set_all_edges(self):
    """Marks all four edges of every cell in the table as drawn.

    Returns
    -------
    self : object
        The table itself, so calls can be chained.
    """
    n_rows, n_cols = len(self.rows), len(self.cols)
    for r in range(n_rows):
        for c in range(n_cols):
            cell = self.cells[r][c]
            cell.left = cell.right = cell.top = cell.bottom = True
    return self
def set_border_edges(self):
    """Marks the outer border edges of the table as drawn.

    Returns
    -------
    self : object
        The table itself, so calls can be chained.
    """
    last_r = len(self.rows) - 1
    last_c = len(self.cols) - 1
    # Left and right borders: first and last cell of every row.
    for r in range(len(self.rows)):
        row = self.cells[r]
        row[0].left = True
        row[last_c].right = True
    # Top and bottom borders: every cell of the first and last row.
    for c in range(len(self.cols)):
        self.cells[0][c].top = True
        self.cells[last_r][c].bottom = True
    return self
def set_edges(self, vertical, horizontal, jtol=2):
"""Sets a cell's edges to True depending on whether they
overlap with lines found by imgproc.
@ -160,47 +182,47 @@ class Table:
depending on whether the cell spans/extends horizontally or
vertically.
"""
for i in range(len(self.cells)):
for j in range(len(self.cells[i])):
bound = self.cells[i][j].get_bounded_edges()
for r in range(len(self.rows)):
for c in range(len(self.cols)):
bound = self.cells[r][c].get_bounded_edges()
if bound == 4:
continue
elif bound == 3:
if not self.cells[i][j].left:
if (self.cells[i][j].right and
self.cells[i][j].top and
self.cells[i][j].bottom):
self.cells[i][j].spanning_h = True
if not self.cells[r][c].left:
if (self.cells[r][c].right and
self.cells[r][c].top and
self.cells[r][c].bottom):
self.cells[r][c].spanning_h = True
elif not self.cells[i][j].right:
if (self.cells[i][j].left and
self.cells[i][j].top and
self.cells[i][j].bottom):
self.cells[i][j].spanning_h = True
elif not self.cells[r][c].right:
if (self.cells[r][c].left and
self.cells[r][c].top and
self.cells[r][c].bottom):
self.cells[r][c].spanning_h = True
elif not self.cells[i][j].top:
if (self.cells[i][j].left and
self.cells[i][j].right and
self.cells[i][j].bottom):
self.cells[i][j].spanning_v = True
elif not self.cells[r][c].top:
if (self.cells[r][c].left and
self.cells[r][c].right and
self.cells[r][c].bottom):
self.cells[r][c].spanning_v = True
elif not self.cells[i][j].bottom:
if (self.cells[i][j].left and
self.cells[i][j].right and
self.cells[i][j].top):
self.cells[i][j].spanning_v = True
elif not self.cells[r][c].bottom:
if (self.cells[r][c].left and
self.cells[r][c].right and
self.cells[r][c].top):
self.cells[r][c].spanning_v = True
elif bound == 2:
if self.cells[i][j].left and self.cells[i][j].right:
if (not self.cells[i][j].top and
not self.cells[i][j].bottom):
self.cells[i][j].spanning_v = True
if self.cells[r][c].left and self.cells[r][c].right:
if (not self.cells[r][c].top and
not self.cells[r][c].bottom):
self.cells[r][c].spanning_v = True
elif self.cells[i][j].top and self.cells[i][j].bottom:
if (not self.cells[i][j].left and
not self.cells[i][j].right):
self.cells[i][j].spanning_h = True
elif self.cells[r][c].top and self.cells[r][c].bottom:
if (not self.cells[r][c].left and
not self.cells[r][c].right):
self.cells[r][c].spanning_h = True
return self
@ -213,7 +235,7 @@ class Table:
ar : list
"""
ar = []
for i in range(len(self.cells)):
ar.append([self.cells[i][j].get_text().strip()
for j in range(len(self.cells[i]))])
for r in range(len(self.rows)):
ar.append([self.cells[r][c].get_text().strip()
for c in range(len(self.cols))])
return ar

View File

@ -1,5 +1,6 @@
from __future__ import division
import os
import logging
import numpy as np
@ -11,7 +12,8 @@ from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal, LTTextLineVertical
from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
LTTextLineVertical)
def translate(x1, x2):
@ -174,22 +176,20 @@ def scale_to_pdf(tables, v_segments, h_segments, factors):
return tables_new, v_segments_new, h_segments_new
def get_rotation(ltchar, lttextlh=None, lttextlv=None):
def get_rotation(lttextlh, lttextlv, ltchar):
"""Detects if text in table is vertical or not using the current
transformation matrix (CTM) and returns its orientation.
Parameters
----------
ltchar : list
List of PDFMiner LTChar objects.
lttextlh : list
List of PDFMiner LTTextLineHorizontal objects.
(optional, default: None)
lttextlv : list
List of PDFMiner LTTextLineVertical objects.
(optional, default: None)
ltchar : list
List of PDFMiner LTChar objects.
Returns
-------
@ -199,15 +199,9 @@ def get_rotation(ltchar, lttextlh=None, lttextlv=None):
anti-clockwise and 'right' if rotated 90 degree clockwise.
"""
rotation = ''
if lttextlh is not None and lttextlv is not None:
hlen = len([t for t in lttextlh if t.get_text().strip()])
vlen = len([t for t in lttextlv if t.get_text().strip()])
vger = 0.0
else:
hlen = len([t for t in ltchar if t.upright and t.get_text().strip()])
vlen = len([t for t in ltchar if (not t.upright) and t.get_text().strip()])
vger = vlen / float(hlen+vlen)
if hlen < vlen or vger > 0.8:
if hlen < vlen:
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
rotation = 'left' if clockwise < anticlockwise else 'right'
@ -247,7 +241,183 @@ def segments_bbox(bbox, v_segments, h_segments):
return v_s, h_s
def text_bbox(bbox, text):
def rotate_segments(v_s, h_s, table_rotation):
    """Rotates line segments if the table is rotated.

    After a quarter turn the vertical segments become horizontal ones
    and vice versa. For an unrotated table the input lists are returned
    as they are; for an unrecognised rotation both results are empty.

    Parameters
    ----------
    v_s : list
        List of vertical line segments.

    h_s : list
        List of horizontal line segments.

    table_rotation : string
        {'', 'left', 'right'}

    Returns
    -------
    vertical : list
        List of rotated vertical line segments.

    horizontal : list
        List of rotated horizontal line segments.
    """
    if table_rotation == '':
        return v_s, h_s

    def _turn(seg, angle, swap_ends):
        # Rotate both end points about the origin, optionally emitting
        # the second end point first.
        x0, y0 = rotate(0, 0, seg[0], seg[1], angle)
        x1, y1 = rotate(0, 0, seg[2], seg[3], angle)
        if swap_ends:
            return (x1, y1, x0, y0)
        return (x0, y0, x1, y1)

    vertical, horizontal = [], []
    if table_rotation == 'left':
        for seg in v_s:
            horizontal.append(_turn(seg, -np.pi / 2, False))
        for seg in h_s:
            vertical.append(_turn(seg, -np.pi / 2, True))
    elif table_rotation == 'right':
        for seg in v_s:
            horizontal.append(_turn(seg, np.pi / 2, True))
        for seg in h_s:
            vertical.append(_turn(seg, np.pi / 2, False))
    return vertical, horizontal
def rotate_textlines(lh_bbox, lv_bbox, table_rotation):
    """Rotates bounding boxes of LTTextLineHorizontals and
    LTTextLineVerticals if the table is rotated.

    The bounding boxes of the text lines, and of the LTChar objects they
    contain, are updated in place through set_bbox. For any non-empty
    rotation the two lists swap roles in the returned dict.

    Parameters
    ----------
    lh_bbox : list
        List of PDFMiner LTTextLineHorizontal objects.

    lv_bbox : list
        List of PDFMiner LTTextLineVertical objects.

    table_rotation : string
        {'', 'left', 'right'}

    Returns
    -------
    t_bbox : dict
        Dict with two keys 'horizontal' and 'vertical' with lists of
        LTTextLineHorizontals and LTTextLineVerticals respectively.
    """
    if table_rotation == '':
        return {'horizontal': lh_bbox, 'vertical': lv_bbox}

    def _turn(obj, angle, swap_x):
        # Rotate both corners about the origin, then store them with
        # either the x or the y pair exchanged.
        x0, y0, x1, y1 = obj.bbox
        x0, y0 = rotate(0, 0, x0, y0, angle)
        x1, y1 = rotate(0, 0, x1, y1, angle)
        if swap_x:
            obj.set_bbox((x1, y0, x0, y1))
        else:
            obj.set_bbox((x0, y1, x1, y0))

    def _turn_lines(lines, angle, swap_x):
        for line in lines:
            _turn(line, angle, swap_x)
            # Characters follow the same transformation as their line.
            for obj in line._objs:
                if isinstance(obj, LTChar):
                    _turn(obj, angle, swap_x)

    if table_rotation == 'left':
        _turn_lines(lh_bbox, -np.pi / 2, True)
        _turn_lines(lv_bbox, -np.pi / 2, False)
    elif table_rotation == 'right':
        _turn_lines(lh_bbox, np.pi / 2, False)
        _turn_lines(lv_bbox, np.pi / 2, True)
    return {'horizontal': lv_bbox, 'vertical': lh_bbox}
def rotate_table(R, C, table_rotation):
    """Rotates coordinates of table rows and columns.

    After a quarter turn the rows of the input become the columns of
    the output and vice versa. For an unrotated table the inputs are
    returned as they are; for an unrecognised rotation both results are
    empty.

    Parameters
    ----------
    R : list
        List of row coordinate tuples.

    C : list
        List of column coordinate tuples.

    table_rotation : string
        {'', 'left', 'right'}

    Returns
    -------
    rows : list
        List of rotated row coordinate tuples.

    cols : list
        List of rotated column coordinate tuples.
    """
    if table_rotation == '':
        return R, C

    rows, cols = [], []
    if table_rotation == 'left':
        for r in R:
            near_x, _ = rotate(0, 0, 0, r[0], -np.pi / 2)
            far_x, _ = rotate(0, 0, 0, r[1], -np.pi / 2)
            cols.append((far_x, near_x))
        cols.sort()
        for c in C:
            _, near_y = rotate(0, 0, c[0], 0, -np.pi / 2)
            _, far_y = rotate(0, 0, c[1], 0, -np.pi / 2)
            rows.append((near_y, far_y))
    elif table_rotation == 'right':
        for r in R:
            near_x, _ = rotate(0, 0, 0, r[0], np.pi / 2)
            far_x, _ = rotate(0, 0, 0, r[1], np.pi / 2)
            cols.append((near_x, far_x))
        for c in C:
            _, near_y = rotate(0, 0, c[0], 0, np.pi / 2)
            _, far_y = rotate(0, 0, c[1], 0, np.pi / 2)
            rows.append((far_y, near_y))
        rows.sort(reverse=True)
    return rows, cols
def text_in_bbox(bbox, text):
"""Returns all text objects present inside a
table's bounding box.
@ -330,66 +500,141 @@ def merge_close_values(ar, mtol=2):
return ret
def get_row_index(t, rows):
"""Gets index of the row in which the given text object lies by
comparing their y-coordinates.
def split_textline(table, textline, direction):
"""Splits PDFMiner LTTextLine into substrings if it spans across
multiple rows/columns.
Parameters
----------
t : object
table : object
camelot.table.Table
rows : list
List of row coordinate tuples, sorted in decreasing order.
textline : object
PDFMiner LTTextLine object.
direction : string
{'horizontal', 'vertical'}
Direction of the PDFMiner LTTextLine object.
Returns
-------
r : int
error : float
cut_text : list
List of tuples of the form (r_idx, c_idx, text) where r_idx and
c_idx are the row and column indices of a cell and text is the
textline substring assigned to that cell.
"""
offset1, offset2 = 0, 0
for r in range(len(rows)):
if (t.y0 + t.y1) / 2.0 < rows[r][0] and (t.y0 + t.y1) / 2.0 > rows[r][1]:
if t.y0 > rows[r][0]:
offset1 = abs(t.y0 - rows[r][0])
if t.y1 < rows[r][1]:
offset2 = abs(t.y1 - rows[r][1])
X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
charea = X * Y
error = (X * (offset1 + offset2)) / charea
return r, error
idx = 0
cut_text = []
bbox = textline.bbox
if direction == 'horizontal' and not textline.is_empty():
x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]]
r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]]
r = r_idx[0]
x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right]
if not x_cuts:
x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
for obj in textline._objs:
row = table.rows[r]
for cut in x_cuts:
if isinstance(obj, LTChar):
if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and
(obj.x0 + obj.x1) / 2 <= cut[1]):
cut_text.append((r, cut[0], obj.get_text().strip('\n')))
break
elif isinstance(obj, LTAnno):
cut_text.append((r, cut[0], obj.get_text().strip('\n')))
elif direction == 'vertical' and not textline.is_empty():
y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]]
c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]]
c = c_idx[0]
y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom]
if not y_cuts:
y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
for obj in textline._objs:
col = table.cols[c]
for cut in y_cuts:
if isinstance(obj, LTChar):
if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and
(obj.y0 + obj.y1) / 2 >= cut[1]):
cut_text.append((cut[0], c, obj.get_text()))
break
elif isinstance(obj, LTAnno):
cut_text.append((cut[0], c, obj.get_text().strip('\n')))
return cut_text
def get_column_index(t, columns):
"""Gets index of the column in which the given text object lies by
comparing their x-coordinates.
def get_table_index(table, t, direction, split_text=False):
"""Gets indices of the cell where given text object lies by
comparing their y and x-coordinates.
Parameters
----------
t : object
table : object
camelot.table.Table
columns : list
List of column coordinate tuples.
t : object
PDFMiner LTTextLine object.
direction : string
{'horizontal', 'vertical'}
Direction of the PDFMiner LTTextLine object.
split_text : bool
Whether or not to split a text line if it spans across
multiple cells.
(optional, default: False)
Returns
-------
c : int
indices : list
List of tuples of the form (r_idx, c_idx, text) where r_idx and
c_idx are the row and column indices of a cell and text is the
textline text (or substring, when split_text is set) assigned to it.
error : float
Assignment error, percentage of text area that lies outside
a cell.
+-------+
| |
| [Text bounding box]
| |
+-------+
"""
offset1, offset2 = 0, 0
for c in range(len(columns)):
if (t.x0 + t.x1) / 2.0 > columns[c][0] and (t.x0 + t.x1) / 2.0 < columns[c][1]:
if t.x0 < columns[c][0]:
offset1 = abs(t.x0 - columns[c][0])
if t.x1 > columns[c][1]:
offset2 = abs(t.x1 - columns[c][1])
r_idx, c_idx = [-1] * 2
for r in range(len(table.rows)):
if ((t.y0 + t.y1) / 2.0 < table.rows[r][0] and
(t.y0 + t.y1) / 2.0 > table.rows[r][1]):
lt_col_overlap = []
for c in table.cols:
if c[0] <= t.x1 and c[1] >= t.x0:
left = t.x0 if c[0] <= t.x0 else c[0]
right = t.x1 if c[1] >= t.x1 else c[1]
lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1]))
else:
lt_col_overlap.append(-1)
if len(filter(lambda x: x != -1, lt_col_overlap)) == 0:
logging.warning("Text doesn't fit any column.")
r_idx = r
c_idx = lt_col_overlap.index(max(lt_col_overlap))
break
# error calculation
y0_offset, y1_offset, x0_offset, x1_offset = [0] * 4
if t.y0 > table.rows[r_idx][0]:
y0_offset = abs(t.y0 - table.rows[r_idx][0])
if t.y1 < table.rows[r_idx][1]:
y1_offset = abs(t.y1 - table.rows[r_idx][1])
if t.x0 < table.cols[c_idx][0]:
x0_offset = abs(t.x0 - table.cols[c_idx][0])
if t.x1 > table.cols[c_idx][1]:
x1_offset = abs(t.x1 - table.cols[c_idx][1])
X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
charea = X * Y
error = (Y * (offset1 + offset2)) / charea
return c, error
error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
if split_text:
return split_textline(table, t, direction), error
else:
return [(r_idx, c_idx, t.get_text().strip('\n'))], error
def get_score(error_weights):
@ -448,9 +693,14 @@ def count_empty(d):
Returns
-------
n_empty_rows : number of empty rows
n_empty_cols : number of empty columns
empty_p : percentage of empty cells
n_empty_rows : list
Number of empty rows.
n_empty_cols : list
Number of empty columns.
empty_p : float
Percentage of empty cells.
"""
empty_p = 0
r_nempty_cells, c_nempty_cells = [], []
@ -491,7 +741,7 @@ def encode_list(ar):
return ar
def get_text_objects(layout, LTType="char", t=None):
def get_text_objects(layout, ltype="char", t=None):
"""Recursively parses pdf layout to get a list of
text objects.
@ -500,7 +750,7 @@ def get_text_objects(layout, LTType="char", t=None):
layout : object
PDFMiner LTPage object.
LTType : string
ltype : string
{'char', 'lh', 'lv'}
Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal,
and LTTextLineVertical objects respectively.
@ -512,11 +762,11 @@ def get_text_objects(layout, LTType="char", t=None):
t : list
List of PDFMiner text objects.
"""
if LTType == "char":
if ltype == "char":
LTObject = LTChar
elif LTType == "lh":
elif ltype == "lh":
LTObject = LTTextLineHorizontal
elif LTType == "lv":
elif ltype == "lv":
LTObject = LTTextLineVertical
if t is None:
t = []
@ -525,7 +775,7 @@ def get_text_objects(layout, LTType="char", t=None):
if isinstance(obj, LTObject):
t.append(obj)
else:
t += get_text_objects(obj, LTType=LTType)
t += get_text_objects(obj, ltype=ltype)
except AttributeError:
pass
return t

View File

@ -123,45 +123,3 @@ def test_lattice_table_rotation():
manager = Pdf(Lattice(), pdfname, clean=True)
tables = manager.extract()
assert_equal(tables['page-1']['table-1']['data'], data)
def test_lattice_cell_rotation():
    """Lattice extraction of agstat.pdf must reproduce the expected grid.

    Runs the Lattice method over the fixture through ``Pdf`` and compares
    the ``data`` of the first table on the first page, row by row, with
    the literal grid below.
    """
    # Expected cell text, one inner list per table row (11 columns each).
    # Empty strings are cells for which no text is expected.
    expected = [
        ["Sl.No.","District","Projected Population for 2012-13(In lakhs)","Adult Equivalent to 88%(In lakhs)","Total Consumptionrequirement(@ 400gms/adult/day)(In Lakh tonnes)","Total Requirement(Including seeds, feeds & wastage)(In Lakh tonnes)","Production (Rice)(In Lakh tonnes)","","","Surplus/Defi cit(In Lakh tonnes)",""],
        ["","","","","","","Kharif","Rabi","Total","Rice","Paddy"],
        ["1","Balasore","23.65","20.81","3.04","3.47","2.78","0.86","3.64","0.17","0.25"],
        ["2","Bhadrak","15.34","13.50","1.97","2.25","3.50","0.05","3.55","1.30","1.94"],
        ["3","Balangir","17.01","14.97","2.19","2.50","6.23","0.10","6.33","3.83","5.72"],
        ["4","Subarnapur","6.70","5.90","0.86","0.98","4.48","1.13","5.61","4.63","6.91"],
        ["5","Cuttack","26.63","23.43","3.42","3.91","3.75","0.06","3.81","-0.10","-0.15"],
        ["6","Jagatsingpur","11.49","10.11","1.48","1.69","2.10","0.02","2.12","0.43","0.64"],
        ["7","Jajpur","18.59","16.36","2.39","2.73","2.13","0.04","2.17","-0.56","-0.84"],
        ["8","Kendrapara","14.62","12.87","1.88","2.15","2.60","0.07","2.67","0.52","0.78"],
        ["9","Dhenkanal","12.13","10.67","1.56","1.78","2.26","0.02","2.28","0.50","0.75"],
        ["10","Angul","12.93","11.38","1.66","1.90","1.73","0.02","1.75","-0.15","-0.22"],
        ["11","Ganjam","35.77","31.48","4.60","5.26","4.57","0.00","4.57","-0.69","-1.03"],
        ["12","Gajapati","5.85","5.15","0.75","0.86","0.68","0.01","0.69","-0.17","-0.25"],
        ["13","Kalahandi","16.12","14.19","2.07","2.37","5.42","1.13","6.55","4.18","6.24"],
        ["14","Nuapada","6.18","5.44","0.79","0.90","1.98","0.08","2.06","1.16","1.73"],
        ["15","Keonjhar","18.42","16.21","2.37","2.71","2.76","0.08","2.84","0.13","0.19"],
        ["16","Koraput","14.09","12.40","1.81","2.07","2.08","0.34","2.42","0.35","0.52"],
        ["17","Malkangiri","6.31","5.55","0.81","0.93","1.78","0.04","1.82","0.89","1.33"],
        ["18","Nabarangpur","12.50","11.00","1.61","1.84","3.26","0.02","3.28","1.44","2.15"],
        ["19","Rayagada","9.83","8.65","1.26","1.44","1.15","0.03","1.18","-0.26","-0.39"],
        ["20","Mayurbhanj","25.61","22.54","3.29","3.76","4.90","0.06","4.96","1.20","1.79"],
        ["21","Kandhamal","7.45","6.56","0.96","1.10","0.70","0.01","0.71","-0.39","-0.58"],
        ["22","Boudh","4.51","3.97","0.58","0.66","1.73","0.03","1.76","1.10","1.64"],
        ["23","Puri","17.29","15.22","2.22","2.54","2.45","0.99","3.44","0.90","1.34"],
        ["24","Khordha","23.08","20.31","2.97","3.39","2.02","0.03","2.05","-1.34","-2.00"],
        ["25","Nayagarh","9.78","8.61","1.26","1.44","2.10","0.00","2.10","0.66","0.99"],
        ["26","Sambalpur","10.62","9.35","1.37","1.57","3.45","0.71","4.16","2.59","3.87"],
        ["27","Bargarh","15.00","13.20","1.93","2.21","6.87","2.65","9.52","7.31","10.91"],
        ["28","Deogarh","3.18","2.80","0.41","0.47","1.12","0.07","1.19","0.72","1.07"],
        ["29","Jharsuguda","5.91","5.20","0.76","0.87","0.99","0.01","1.00","0.13","0.19"],
        ["30","Sundargarh","21.21","18.66","2.72","3.11","4.72","0.02","4.74","1.63","2.43"],
        ["ODISHA","","427.80","376.49","54.99","62.86","86.29","8.68","94.97","32.11","47.92"]
    ]

    # Build the extraction pipeline for the fixture and run it in one go.
    fixture_path = os.path.join(testdir, 'agstat.pdf')
    extracted = Pdf(Lattice(), fixture_path, clean=True).extract()

    # Only the first table on the first page is checked.
    first_table = extracted['page-1']['table-1']
    assert_equal(first_table['data'], expected)

View File

@ -169,45 +169,45 @@ def test_stream_columns():
def test_stream_table_rotation():
data = [
["Table 21 Current use of contraception by background characteristics—Continued","","","","","","","","","","","","","","","",""],
["","","","","","Modern method","","","","","","","Traditional method","","","",""],
["","","Any","","","","","","","Other","Any","","","","Not","","Number"],
["","Any","modern","Female","Male","","","","Condom/","modern","traditional","","With-","Folk","currently","","of"],
["Background characteristic","method","method","sterilization","sterilization","Pill","IUD","Injectables","Nirodh","method","method","Rhythm","drawal","method","using","Total","women"],
["Caste/tribe","","","","","","","","","","","","","","","",""],
["Scheduled caste","74.8","55.8","42.9","0.9","9.7","0.0","0.2","2.2","0.0","19.0","11.2","7.4","0.4","25.2","100.0","1,363"],
["Scheduled tribe","59.3","39.0","26.8","0.6","6.4","0.6","1.2","3.5","0.0","20.3","10.4","5.8","4.1","40.7","100.0","256"],
["Other backward class","71.4","51.1","34.9","0.0","8.6","1.4","0.0","6.2","0.0","20.4","12.6","7.8","0.0","28.6","100.0","211"],
["Other","71.1","48.8","28.2","0.8","13.3","0.9","0.3","5.2","0.1","22.3","12.9","9.1","0.3","28.9","100.0","3,319"],
["Wealth index","","","","","","","","","","","","","","","",""],
["Lowest","64.5","48.6","34.3","0.5","10.5","0.6","0.7","2.0","0.0","15.9","9.9","4.6","1.4","35.5","100.0","1,258"],
["Second","68.5","50.4","36.2","1.1","11.4","0.5","0.1","1.1","0.0","18.1","11.2","6.7","0.2","31.5","100.0","1,317"],
["Middle","75.5","52.8","33.6","0.6","14.2","0.4","0.5","3.4","0.1","22.7","13.4","8.9","0.4","24.5","100.0","1,018"],
["Fourth","73.9","52.3","32.0","0.5","12.5","0.6","0.2","6.3","0.2","21.6","11.5","9.9","0.2","26.1","100.0","908"],
["Highest","78.3","44.4","19.5","1.0","9.7","1.4","0.0","12.7","0.0","33.8","18.2","15.6","0.0","21.7","100.0","733"],
["Number of living children","","","","","","","","","","","","","","","",""],
["No children","25.1","7.6","0.3","0.5","2.0","0.0","0.0","4.8","0.0","17.5","9.0","8.5","0.0","74.9","100.0","563"],
["1 child","66.5","32.1","3.7","0.7","20.1","0.7","0.1","6.9","0.0","34.3","18.9","15.2","0.3","33.5","100.0","1,190"],
["1 son","66.8","33.2","4.1","0.7","21.1","0.5","0.3","6.6","0.0","33.5","21.2","12.3","0.0","33.2","100.0","672"],
["No sons","66.1","30.7","3.1","0.6","18.8","0.8","0.0","7.3","0.0","35.4","15.8","19.0","0.6","33.9","100.0","517"],
["2 children","81.6","60.5","41.8","0.9","11.6","0.8","0.3","4.8","0.2","21.1","12.2","8.3","0.6","18.4","100.0","1,576"],
["1 or more sons","83.7","64.2","46.4","0.9","10.8","0.8","0.4","4.8","0.1","19.5","11.1","7.6","0.7","16.3","100.0","1,268"],
["No sons","73.2","45.5","23.2","1.0","15.1","0.9","0.0","4.8","0.5","27.7","16.8","11.0","0.0","26.8","100.0","308"],
["3 children","83.9","71.2","57.7","0.8","9.8","0.6","0.5","1.8","0.0","12.7","8.7","3.3","0.8","16.1","100.0","961"],
["1 or more sons","85.0","73.2","60.3","0.9","9.4","0.5","0.5","1.6","0.0","11.8","8.1","3.0","0.7","15.0","100.0","860"],
["No sons","74.7","53.8","35.3","0.0","13.7","1.6","0.0","3.2","0.0","20.9","13.4","6.1","1.5","25.3","100.0","101"],
["4+ children","74.3","58.1","45.1","0.6","8.7","0.6","0.7","2.4","0.0","16.1","9.9","5.4","0.8","25.7","100.0","944"],
["1 or more sons","73.9","58.2","46.0","0.7","8.3","0.7","0.7","1.9","0.0","15.7","9.4","5.5","0.8","26.1","100.0","901"],
["No sons","(82.1)","(57.3)","(25.6)","(0.0)","(17.8)","(0.0)","(0.0)","(13.9)","(0.0)","(24.8)","(21.3)","(3.5)","(0.0)","(17.9)","100.0","43"],
["Total","71.2","49.9","32.2","0.7","11.7","0.6","0.3","4.3","0.1","21.3","12.3","8.4","0.5","28.8","100.0","5,234"],
["NFHS-2 (1998-99)","66.6","47.3","32.0","1.8","9.2","1.4","na","2.9","na","na","8.7","9.8","na","33.4","100.0","4,116"],
["NFHS-1 (1992-93)","57.7","37.6","26.5","4.3","3.6","1.3","0.1","1.9","na","na","11.3","8.3","na","42.3","100.0","3,970"],
["","Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are","","","","","","","","","","","","","","",""],
["not shown separately.","","","","","","","","","","","","","","","",""],
["na = Not available","","","","","","","","","","","","","","","",""],
["","ns = Not shown; see table 2b, footnote 1","","","","","","","","","","","","","","",""],
["( ) Based on 25-49 unweighted cases.","","","","","","","","","","","","","","","",""],
["","","","","","","","54","","","","","","","","",""]
["","","Table 21 Current use of contraception by background characteristics—Continued","","","","","","","","","","","","","","",""],
["","","","","","","Modern method","","","","","","","Traditional method","","","",""],
["","","","Any","","","","","","","Other","Any","","","","Not","","Number"],
["","","Any","modern","Female","Male","","","","Condom/","modern","traditional","","With-","Folk","currently","","of"],
["","Background characteristic","method","method","sterilization","sterilization","Pill","IUD","Injectables","Nirodh","method","method","Rhythm","drawal","method","using","Total","women"],
["","Caste/tribe","","","","","","","","","","","","","","","",""],
["","Scheduled caste","74.8","55.8","42.9","0.9","9.7","0.0","0.2","2.2","0.0","19.0","11.2","7.4","0.4","25.2","100.0","1,363"],
["","Scheduled tribe","59.3","39.0","26.8","0.6","6.4","0.6","1.2","3.5","0.0","20.3","10.4","5.8","4.1","40.7","100.0","256"],
["","Other backward class","71.4","51.1","34.9","0.0","8.6","1.4","0.0","6.2","0.0","20.4","12.6","7.8","0.0","28.6","100.0","211"],
["","Other","71.1","48.8","28.2","0.8","13.3","0.9","0.3","5.2","0.1","22.3","12.9","9.1","0.3","28.9","100.0","3,319"],
["","Wealth index","","","","","","","","","","","","","","","",""],
["","Lowest","64.5","48.6","34.3","0.5","10.5","0.6","0.7","2.0","0.0","15.9","9.9","4.6","1.4","35.5","100.0","1,258"],
["","Second","68.5","50.4","36.2","1.1","11.4","0.5","0.1","1.1","0.0","18.1","11.2","6.7","0.2","31.5","100.0","1,317"],
["","Middle","75.5","52.8","33.6","0.6","14.2","0.4","0.5","3.4","0.1","22.7","13.4","8.9","0.4","24.5","100.0","1,018"],
["","Fourth","73.9","52.3","32.0","0.5","12.5","0.6","0.2","6.3","0.2","21.6","11.5","9.9","0.2","26.1","100.0","908"],
["","Highest","78.3","44.4","19.5","1.0","9.7","1.4","0.0","12.7","0.0","33.8","18.2","15.6","0.0","21.7","100.0","733"],
["","Number of living children","","","","","","","","","","","","","","","",""],
["","No children","25.1","7.6","0.3","0.5","2.0","0.0","0.0","4.8","0.0","17.5","9.0","8.5","0.0","74.9","100.0","563"],
["","1 child","66.5","32.1","3.7","0.7","20.1","0.7","0.1","6.9","0.0","34.3","18.9","15.2","0.3","33.5","100.0","1,190"],
["","1 son","66.8","33.2","4.1","0.7","21.1","0.5","0.3","6.6","0.0","33.5","21.2","12.3","0.0","33.2","100.0","672"],
["","No sons","66.1","30.7","3.1","0.6","18.8","0.8","0.0","7.3","0.0","35.4","15.8","19.0","0.6","33.9","100.0","517"],
["","2 children","81.6","60.5","41.8","0.9","11.6","0.8","0.3","4.8","0.2","21.1","12.2","8.3","0.6","18.4","100.0","1,576"],
["","1 or more sons","83.7","64.2","46.4","0.9","10.8","0.8","0.4","4.8","0.1","19.5","11.1","7.6","0.7","16.3","100.0","1,268"],
["","No sons","73.2","45.5","23.2","1.0","15.1","0.9","0.0","4.8","0.5","27.7","16.8","11.0","0.0","26.8","100.0","308"],
["","3 children","83.9","71.2","57.7","0.8","9.8","0.6","0.5","1.8","0.0","12.7","8.7","3.3","0.8","16.1","100.0","961"],
["","1 or more sons","85.0","73.2","60.3","0.9","9.4","0.5","0.5","1.6","0.0","11.8","8.1","3.0","0.7","15.0","100.0","860"],
["","No sons","74.7","53.8","35.3","0.0","13.7","1.6","0.0","3.2","0.0","20.9","13.4","6.1","1.5","25.3","100.0","101"],
["","4+ children","74.3","58.1","45.1","0.6","8.7","0.6","0.7","2.4","0.0","16.1","9.9","5.4","0.8","25.7","100.0","944"],
["","1 or more sons","73.9","58.2","46.0","0.7","8.3","0.7","0.7","1.9","0.0","15.7","9.4","5.5","0.8","26.1","100.0","901"],
["","No sons","(82.1)","(57.3)","(25.6)","(0.0)","(17.8)","(0.0)","(0.0)","(13.9)","(0.0)","(24.8)","(21.3)","(3.5)","(0.0)","(17.9)","100.0","43"],
["","Total","71.2","49.9","32.2","0.7","11.7","0.6","0.3","4.3","0.1","21.3","12.3","8.4","0.5","28.8","100.0","5,234"],
["","NFHS-2 (1998-99)","66.6","47.3","32.0","1.8","9.2","1.4","na","2.9","na","na","8.7","9.8","na","33.4","100.0","4,116"],
["","NFHS-1 (1992-93)","57.7","37.6","26.5","4.3","3.6","1.3","0.1","1.9","na","na","11.3","8.3","na","42.3","100.0","3,970"],
["","","Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are","","","","","","","","","","","","","","",""],
["","not shown separately.","","","","","","","","","","","","","","","",""],
["","na = Not available","","","","","","","","","","","","","","","",""],
["","","ns = Not shown; see table 2b, footnote 1","","","","","","","","","","","","","","",""],
["","( ) Based on 25-49 unweighted cases.","","","","","","","","","","","","","","","",""],
["","","","","","","","","54","","","","","","","","",""]
]
pdfname = os.path.join(testdir, "left_rotated_table_2.pdf")
manager = Pdf(Stream(), pdfname, clean=True)