Support for vertical tables in Stream

* Rename variables

* Add test PDF

* Add tests for Lattice rotation

* Add support for vertical tables in Stream, along with test PDFs

* Add tests for Stream rotation
pull/2/head
Vinayak Mehta 2016-09-15 20:51:59 +05:30 committed by GitHub
parent 8ce7b74671
commit 79afb45e2e
11 changed files with 222 additions and 50 deletions

View File

@ -9,9 +9,10 @@ from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
find_table_joints) find_table_joints)
from .table import Table from .table import Table
from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_bbox, from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_bbox,
detect_vertical, merge_close_values, get_row_index, get_rotation, merge_close_values, get_row_index,
get_column_index, get_score, reduce_index, outline, get_column_index, get_score, reduce_index, outline,
fill_spanning, count_empty, encode_list, pdf_to_text) fill_spanning, count_empty, encode_list, get_page_layout,
get_text_objects)
__all__ = ['Lattice'] __all__ = ['Lattice']
@ -62,7 +63,7 @@ class Lattice:
page as value. page as value.
""" """
def __init__(self, table_area=None, fill=None, mtol=[2], scale=15, def __init__(self, table_area=None, fill=None, mtol=[2], scale=15,
invert=False, margins=(2.0, 0.5, 0.1), debug=None): invert=False, margins=(1.0, 0.5, 0.1), debug=None):
self.method = 'lattice' self.method = 'lattice'
self.table_area = table_area self.table_area = table_area
@ -82,10 +83,14 @@ class Lattice:
Dictionary with page number as key and list of tables on that Dictionary with page number as key and list of tables on that
page as value. page as value.
""" """
text, __, width, height = pdf_to_text(pdfname, self.char_margin, layout, dim = get_page_layout(pdfname, char_margin=self.char_margin,
self.line_margin, self.word_margin) line_margin=self.line_margin, word_margin=self.word_margin)
ltchar = get_text_objects(layout, LTType="char")
lttextlh = get_text_objects(layout, LTType="lh")
lttextlv = get_text_objects(layout, LTType="lv")
width, height = dim
bname, __ = os.path.splitext(pdfname) bname, __ = os.path.splitext(pdfname)
if not text: if not ltchar:
logging.warning("{0}: PDF has no text. It may be an image.".format( logging.warning("{0}: PDF has no text. It may be an image.".format(
os.path.basename(bname))) os.path.basename(bname)))
return None return None
@ -156,9 +161,11 @@ class Lattice:
# select elements which lie within table_bbox # select elements which lie within table_bbox
table_data = {} table_data = {}
v_s, h_s = segments_bbox(k, v_segments, h_segments) v_s, h_s = segments_bbox(k, v_segments, h_segments)
t_bbox = text_bbox(k, text) char_bbox = text_bbox(k, ltchar)
table_data['text_p'] = 100 * (1 - (len(t_bbox) / len(text))) lh_bbox = text_bbox(k, lttextlh)
table_rotation = detect_vertical(t_bbox) lv_bbox = text_bbox(k, lttextlv)
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
table_rotation = get_rotation(char_bbox, lh_bbox, lv_bbox)
cols, rows = zip(*table_bbox[k]) cols, rows = zip(*table_bbox[k])
cols, rows = list(cols), list(rows) cols, rows = list(cols), list(rows)
cols.extend([k[0], k[2]]) cols.extend([k[0], k[2]])
@ -187,7 +194,7 @@ class Lattice:
rerror = [] rerror = []
cerror = [] cerror = []
for t in text: for t in char_bbox:
try: try:
r_idx, rass_error = get_row_index(t, rows) r_idx, rass_error = get_row_index(t, rows)
except TypeError: except TypeError:
@ -207,7 +214,7 @@ class Lattice:
for j in range(len(table.cells[i])): for j in range(len(table.cells[i])):
t_bbox = table.cells[i][j].get_objects() t_bbox = table.cells[i][j].get_objects()
try: try:
cell_rotation = detect_vertical(t_bbox) cell_rotation = get_rotation(t_bbox)
except ZeroDivisionError: except ZeroDivisionError:
cell_rotation = '' cell_rotation = ''
pass pass

View File

@ -7,8 +7,8 @@ import copy_reg
import numpy as np import numpy as np
from .table import Table from .table import Table
from .utils import (get_row_index, get_score, count_empty, encode_list, from .utils import (rotate, get_row_index, get_score, count_empty, encode_list,
pdf_to_text, text_bbox) get_page_layout, get_text_objects, text_bbox, get_rotation)
__all__ = ['Stream'] __all__ = ['Stream']
@ -199,7 +199,7 @@ class Stream:
page as value. page as value.
""" """
def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2], def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2],
mtol=[2], margins=(2.0, 0.5, 0.1), debug=False): mtol=[0], margins=(1.0, 0.5, 0.1), debug=False):
self.method = 'stream' self.method = 'stream'
self.table_area = table_area self.table_area = table_area
@ -219,17 +219,20 @@ class Stream:
Dictionary with page number as key and list of tables on that Dictionary with page number as key and list of tables on that
page as value. page as value.
""" """
__, text, width, height = pdf_to_text(pdfname, self.char_margin, layout, dim = get_page_layout(pdfname, char_margin=self.char_margin,
self.line_margin, self.word_margin) line_margin=self.line_margin, word_margin=self.word_margin)
ltchar = get_text_objects(layout, LTType="char")
lttextlh = get_text_objects(layout, LTType="lh")
lttextlv = get_text_objects(layout, LTType="lv")
width, height = dim
bname, __ = os.path.splitext(pdfname) bname, __ = os.path.splitext(pdfname)
if not text: if not lttextlh:
logging.warning("{0}: PDF has no text. It may be an image.".format( logging.warning("{0}: PDF has no text. It may be an image.".format(
os.path.basename(bname))) os.path.basename(bname)))
return None return None
if self.debug: if self.debug:
self.debug_text = [(t.x0, t.y0, t.x1, t.y1) for t in text] self.debug_text = []
return None
if self.table_area is not None: if self.table_area is not None:
if self.columns is not None: if self.columns is not None:
@ -261,11 +264,35 @@ class Stream:
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True): for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
# select elements which lie within table_bbox # select elements which lie within table_bbox
table_data = {} table_data = {}
t_bbox = text_bbox(k, text) table_rotation = get_rotation(ltchar, lttextlh, lttextlv)
if table_rotation != '':
t_bbox = text_bbox(k, lttextlv)
if table_rotation == 'left':
if self.debug:
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
for t in t_bbox:
x0, y0, x1, y1 = t.bbox
x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2)
x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2)
t.set_bbox((x0, y1, x1, y0))
elif table_rotation == 'right':
for t in t_bbox:
x0, y0, x1, y1 = t.bbox
x0, y0 = rotate(0, 0, x0, y0, np.pi / 2)
x1, y1 = rotate(0, 0, x1, y1, np.pi / 2)
t.set_bbox((x1, y0, x0, y1))
else:
if self.debug:
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
t_bbox = text_bbox(k, lttextlh)
t_bbox.sort(key=lambda x: (-x.y0, x.x0)) t_bbox.sort(key=lambda x: (-x.y0, x.x0))
text_x_min = min([t.x0 for t in t_bbox])
text_y_min = min([t.y0 for t in t_bbox])
text_x_max = max([t.x1 for t in t_bbox])
text_y_max = max([t.y1 for t in t_bbox])
rows_grouped = _group_rows(t_bbox, ytol=self.ytol[table_no]) rows_grouped = _group_rows(t_bbox, ytol=self.ytol[table_no])
rows = _join_rows(rows_grouped, k[3], k[1]) rows = _join_rows(rows_grouped, text_y_max, text_y_min)
elements = [len(r) for r in rows_grouped] elements = [len(r) for r in rows_grouped]
guess = False guess = False
@ -275,8 +302,13 @@ class Stream:
# similar to else condition # similar to else condition
# len can't be 1 # len can't be 1
cols = self.columns[table_no].split(',') cols = self.columns[table_no].split(',')
cols = [(float(cols[i]), float(cols[i + 1])) cols = [float(c) for c in cols]
for i in range(0, len(cols) - 1)] if table_rotation != '':
if table_rotation == 'left':
cols = [rotate(0, 0, 0, c, -np.pi / 2)[0] for c in cols]
elif table_rotation == 'right':
cols = [rotate(0, 0, 0, c, np.pi / 2)[0] for c in cols]
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
else: else:
if self.ncolumns is not None and self.ncolumns[table_no] != -1: if self.ncolumns is not None and self.ncolumns[table_no] != -1:
ncols = self.ncolumns[table_no] ncols = self.ncolumns[table_no]
@ -288,7 +320,7 @@ class Stream:
" isn't the same as what you specified." " isn't the same as what you specified."
" Change the value of mtol.".format( " Change the value of mtol.".format(
os.path.basename(bname))) os.path.basename(bname)))
cols = _join_columns(cols, k[0], k[2]) cols = _join_columns(cols, text_x_min, text_x_max)
else: else:
guess = True guess = True
ncols = max(set(elements), key=elements.count) ncols = max(set(elements), key=elements.count)
@ -310,7 +342,7 @@ class Stream:
outer_text = [t for t in t_bbox if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] outer_text = [t for t in t_bbox if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
inner_text.extend(outer_text) inner_text.extend(outer_text)
cols = _add_columns(cols, inner_text, self.ytol[table_no]) cols = _add_columns(cols, inner_text, self.ytol[table_no])
cols = _join_columns(cols, k[0], k[2]) cols = _join_columns(cols, text_x_min, text_x_max)
table = Table(cols, rows) table = Table(cols, rows)
rerror = [] rerror = []

View File

@ -11,7 +11,7 @@ from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import PDFPageAggregator from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal, LTTextLineVertical
def translate(x1, x2): def translate(x1, x2):
@ -144,7 +144,7 @@ def scale_to_pdf(tables, v_segments, h_segments, factors):
return tables_new, v_segments_new, h_segments_new return tables_new, v_segments_new, h_segments_new
def detect_vertical(text): def get_rotation(ltchar, lttextlh=None, lttextlv=None):
"""Detects if text in table is vertical or not and returns """Detects if text in table is vertical or not and returns
its orientation. its orientation.
@ -156,13 +156,18 @@ def detect_vertical(text):
------- -------
rotation : string rotation : string
""" """
num_v = [t for t in text if (not t.upright) and t.get_text().strip()]
num_h = [t for t in text if t.upright and t.get_text().strip()]
vger = len(num_v) / float(len(num_v) + len(num_h))
rotation = '' rotation = ''
if vger > 0.8: if lttextlh is not None and lttextlv is not None:
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in text) hlen = len([t for t in lttextlh if t.get_text().strip()])
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in text) vlen = len([t for t in lttextlv if t.get_text().strip()])
vger = 0.0
else:
hlen = len([t for t in ltchar if t.upright and t.get_text().strip()])
vlen = len([t for t in ltchar if (not t.upright) and t.get_text().strip()])
vger = vlen / float(hlen+vlen)
if hlen < vlen or vger > 0.8:
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
rotation = 'left' if clockwise < anticlockwise else 'right' rotation = 'left' if clockwise < anticlockwise else 'right'
return rotation return rotation
@ -520,7 +525,7 @@ def encode_list(ar):
return ar return ar
def extract_text_objects(layout, LTObject, t=None): def get_text_objects(layout, LTType="char", t=None):
"""Recursively parses pdf layout to get a list of """Recursively parses pdf layout to get a list of
text objects. text objects.
@ -539,6 +544,12 @@ def extract_text_objects(layout, LTObject, t=None):
t : list t : list
List of text objects. List of text objects.
""" """
if LTType == "char":
LTObject = LTChar
elif LTType == "lh":
LTObject = LTTextLineHorizontal
elif LTType == "lv":
LTObject = LTTextLineVertical
if t is None: if t is None:
t = [] t = []
try: try:
@ -546,15 +557,14 @@ def extract_text_objects(layout, LTObject, t=None):
if isinstance(obj, LTObject): if isinstance(obj, LTObject):
t.append(obj) t.append(obj)
else: else:
t += extract_text_objects(obj, LTObject) t += get_text_objects(obj, LTType=LTType)
except AttributeError: except AttributeError:
pass pass
return t return t
def pdf_to_text(pname, char_margin, line_margin, word_margin): def get_page_layout(pname, char_margin=2.0, line_margin=0.5, word_margin=0.1,
# pkey = 'page-{0}'.format(p) detect_vertical=True, all_texts=True):
# pname = os.path.join(self.temp, '{}.pdf'.format(pkey))
with open(pname, 'r') as f: with open(pname, 'r') as f:
parser = PDFParser(f) parser = PDFParser(f)
document = PDFDocument(parser) document = PDFDocument(parser)
@ -562,16 +572,16 @@ def pdf_to_text(pname, char_margin, line_margin, word_margin):
raise PDFTextExtractionNotAllowed raise PDFTextExtractionNotAllowed
laparams = LAParams(char_margin=char_margin, laparams = LAParams(char_margin=char_margin,
line_margin=line_margin, line_margin=line_margin,
word_margin=word_margin) word_margin=word_margin,
detect_vertical=detect_vertical,
all_texts=all_texts)
rsrcmgr = PDFResourceManager() rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr, laparams=laparams) device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device) interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(document): for page in PDFPage.create_pages(document):
interpreter.process_page(page) interpreter.process_page(page)
layout = device.get_result() layout = device.get_result()
lattice_objects = extract_text_objects(layout, LTChar)
stream_objects = extract_text_objects(
layout, LTTextLineHorizontal)
width = layout.bbox[2] width = layout.bbox[2]
height = layout.bbox[3] height = layout.bbox[3]
return lattice_objects, stream_objects, width, height dim = (width, height)
return layout, dim

BIN
tests/agstat.pdf 100644

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -94,4 +94,74 @@ def test_lattice_invert():
pdfname = os.path.join(testdir, 'lines_in_background_1.pdf') pdfname = os.path.join(testdir, 'lines_in_background_1.pdf')
manager = Pdf(Lattice(invert=True), pdfname, clean=True) manager = Pdf(Lattice(invert=True), pdfname, clean=True)
tables = manager.extract() tables = manager.extract()
assert_equal(tables['page-1']['table-2']['data'], data) assert_equal(tables['page-1']['table-2']['data'], data)
def test_lattice_table_rotation():
data = [
["State","Nutritional Assessment (No. of individuals)","","","","IYCF Practices (No. of mothers: 2011-12)","Blood Pressure (No. of adults: 2011-12)","","Fasting Blood Sugar (No. of adults:2011-12)",""],
["","1975-79","1988-90","1996-97","2011-12","","Men","Women","Men","Women"],
["Kerala","5738","6633","8864","8297","245","2161","3195","1645","2391"],
["Tamil Nadu","7387","10217","5813","7851","413","2134","2858","1119","1739"],
["Karnataka","6453","8138","12606","8958","428","2467","2894","1628","2028"],
["Andhra Pradesh","5844","9920","9545","8300","557","1899","2493","1111","1529"],
["Maharashtra","5161","7796","6883","9525","467","2368","2648","1417","1599"],
["Gujarat","4403","5374","4866","9645","477","2687","3021","2122","2503"],
["Madhya Pradesh","*","*","*","7942","470","1965","2150","1579","1709"],
["Orissa","3756","5540","12024","8473","398","2040","2624","1093","1628"],
["West Bengal","*","*","*","8047","423","2058","2743","1413","2027"],
["Uttar Pradesh","*","*","*","9860","581","2139","2415","1185","1366"],
["Pooled","38742","53618","60601","86898","4459","21918","27041","14312","18519"]
]
pdfname = os.path.join(testdir, 'left_rotated_table_1.pdf')
manager = Pdf(Lattice(), pdfname, clean=True)
tables = manager.extract()
assert_equal(tables['page-1']['table-1']['data'], data)
pdfname = os.path.join(testdir, 'right_rotated_table_1.pdf')
manager = Pdf(Lattice(), pdfname, clean=True)
tables = manager.extract()
assert_equal(tables['page-1']['table-1']['data'], data)
def test_lattice_cell_rotation():
data = [
["Sl.No.","District","Projected Population for 2012-13(In lakhs)","Adult Equivalent to 88%(In lakhs)","Total Consumptionrequirement(@ 400gms/adult/day)(In Lakh tonnes)","Total Requirement(Including seeds, feeds & wastage)(In Lakh tonnes)","Production (Rice)(In Lakh tonnes)","","","Surplus/Defi cit(In Lakh tonnes)",""],
["","","","","","","Kharif","Rabi","Total","Rice","Paddy"],
["1","Balasore","23.65","20.81","3.04","3.47","2.78","0.86","3.64","0.17","0.25"],
["2","Bhadrak","15.34","13.50","1.97","2.25","3.50","0.05","3.55","1.30","1.94"],
["3","Balangir","17.01","14.97","2.19","2.50","6.23","0.10","6.33","3.83","5.72"],
["4","Subarnapur","6.70","5.90","0.86","0.98","4.48","1.13","5.61","4.63","6.91"],
["5","Cuttack","26.63","23.43","3.42","3.91","3.75","0.06","3.81","-0.10","-0.15"],
["6","Jagatsingpur","11.49","10.11","1.48","1.69","2.10","0.02","2.12","0.43","0.64"],
["7","Jajpur","18.59","16.36","2.39","2.73","2.13","0.04","2.17","-0.56","-0.84"],
["8","Kendrapara","14.62","12.87","1.88","2.15","2.60","0.07","2.67","0.52","0.78"],
["9","Dhenkanal","12.13","10.67","1.56","1.78","2.26","0.02","2.28","0.50","0.75"],
["10","Angul","12.93","11.38","1.66","1.90","1.73","0.02","1.75","-0.15","-0.22"],
["11","Ganjam","35.77","31.48","4.60","5.26","4.57","0.00","4.57","-0.69","-1.03"],
["12","Gajapati","5.85","5.15","0.75","0.86","0.68","0.01","0.69","-0.17","-0.25"],
["13","Kalahandi","16.12","14.19","2.07","2.37","5.42","1.13","6.55","4.18","6.24"],
["14","Nuapada","6.18","5.44","0.79","0.90","1.98","0.08","2.06","1.16","1.73"],
["15","Keonjhar","18.42","16.21","2.37","2.71","2.76","0.08","2.84","0.13","0.19"],
["16","Koraput","14.09","12.40","1.81","2.07","2.08","0.34","2.42","0.35","0.52"],
["17","Malkangiri","6.31","5.55","0.81","0.93","1.78","0.04","1.82","0.89","1.33"],
["18","Nabarangpur","12.50","11.00","1.61","1.84","3.26","0.02","3.28","1.44","2.15"],
["19","Rayagada","9.83","8.65","1.26","1.44","1.15","0.03","1.18","-0.26","-0.39"],
["20","Mayurbhanj","25.61","22.54","3.29","3.76","4.90","0.06","4.96","1.20","1.79"],
["21","Kandhamal","7.45","6.56","0.96","1.10","0.70","0.01","0.71","-0.39","-0.58"],
["22","Boudh","4.51","3.97","0.58","0.66","1.73","0.03","1.76","1.10","1.64"],
["23","Puri","17.29","15.22","2.22","2.54","2.45","0.99","3.44","0.90","1.34"],
["24","Khordha","23.08","20.31","2.97","3.39","2.02","0.03","2.05","-1.34","-2.00"],
["25","Nayagarh","9.78","8.61","1.26","1.44","2.10","0.00","2.10","0.66","0.99"],
["26","Sambalpur","10.62","9.35","1.37","1.57","3.45","0.71","4.16","2.59","3.87"],
["27","Bargarh","15.00","13.20","1.93","2.21","6.87","2.65","9.52","7.31","10.91"],
["28","Deogarh","3.18","2.80","0.41","0.47","1.12","0.07","1.19","0.72","1.07"],
["29","Jharsuguda","5.91","5.20","0.76","0.87","0.99","0.01","1.00","0.13","0.19"],
["30","Sundargarh","21.21","18.66","2.72","3.11","4.72","0.02","4.74","1.63","2.43"],
["ODISHA","","427.80","376.49","54.99","62.86","86.29","8.68","94.97","32.11","47.92"]
]
pdfname = os.path.join(testdir, 'agstat.pdf')
manager = Pdf(Lattice(), pdfname, clean=True)
tables = manager.extract()
assert_equal(tables['page-1']['table-1']['data'], data)

View File

@ -81,11 +81,11 @@ def test_stream_missing_value():
["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""], ["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""],
["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""], ["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""],
["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years","","","",""], ["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years","","","",""],
["","Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","",""], ["Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","","",""],
["4","","","",""] ["4","","","",""]
] ]
pdfname = os.path.join(testdir, "missing_values.pdf") pdfname = os.path.join(testdir, "missing_values.pdf")
manager = Pdf(Stream(margins=(1.0, 0.5, 0.1)), pdfname, clean=True) manager = Pdf(Stream(), pdfname, clean=True)
tables = manager.extract() tables = manager.extract()
assert_equal(tables["page-1"]["table-1"]["data"], data) assert_equal(tables["page-1"]["table-1"]["data"], data)
@ -106,8 +106,7 @@ def test_stream_single_table_area():
["(each day of the payroll period)",""] ["(each day of the payroll period)",""]
] ]
pdfname = os.path.join(testdir, "tabula_test_pdfs/us-007.pdf") pdfname = os.path.join(testdir, "tabula_test_pdfs/us-007.pdf")
manager = Pdf(Stream(table_area=["320,500,573,335"], ytol=[10], manager = Pdf(Stream(table_area=["320,500,573,335"]),
margins=(1.0, 0.5, 0.1)),
pdfname, pagenos=[{"start": 1, "end": 1}], clean=True) pdfname, pagenos=[{"start": 1, "end": 1}], clean=True)
tables = manager.extract() tables = manager.extract()
assert_equal(tables["page-1"]["table-1"]["data"], data) assert_equal(tables["page-1"]["table-1"]["data"], data)
@ -164,4 +163,58 @@ def test_stream_columns():
manager = Pdf(Stream(columns=["28,67,180,230,425,475,700"], ytol=[10]), pdfname, manager = Pdf(Stream(columns=["28,67,180,230,425,475,700"], ytol=[10]), pdfname,
clean=True) clean=True)
tables = manager.extract() tables = manager.extract()
assert_equal(tables["page-1"]["table-1"]["data"], data)
def test_stream_table_rotation():
data = [
["Table 21 Current use of contraception by background characteristics—Continued","","","","","","","","","","","","","","","",""],
["","","","","","Modern method","","","","","","","Traditional method","","","",""],
["","","Any","","","","","","","Other","Any","","","","Not","","Number"],
["","Any","modern","Female","Male","","","","Condom/","modern","traditional","","With-","Folk","currently","","of"],
["Background characteristic","method","method","sterilization","sterilization","Pill","IUD","Injectables","Nirodh","method","method","Rhythm","drawal","method","using","Total","women"],
["Caste/tribe","","","","","","","","","","","","","","","",""],
["Scheduled caste","74.8","55.8","42.9","0.9","9.7","0.0","0.2","2.2","0.0","19.0","11.2","7.4","0.4","25.2","100.0","1,363"],
["Scheduled tribe","59.3","39.0","26.8","0.6","6.4","0.6","1.2","3.5","0.0","20.3","10.4","5.8","4.1","40.7","100.0","256"],
["Other backward class","71.4","51.1","34.9","0.0","8.6","1.4","0.0","6.2","0.0","20.4","12.6","7.8","0.0","28.6","100.0","211"],
["Other","71.1","48.8","28.2","0.8","13.3","0.9","0.3","5.2","0.1","22.3","12.9","9.1","0.3","28.9","100.0","3,319"],
["Wealth index","","","","","","","","","","","","","","","",""],
["Lowest","64.5","48.6","34.3","0.5","10.5","0.6","0.7","2.0","0.0","15.9","9.9","4.6","1.4","35.5","100.0","1,258"],
["Second","68.5","50.4","36.2","1.1","11.4","0.5","0.1","1.1","0.0","18.1","11.2","6.7","0.2","31.5","100.0","1,317"],
["Middle","75.5","52.8","33.6","0.6","14.2","0.4","0.5","3.4","0.1","22.7","13.4","8.9","0.4","24.5","100.0","1,018"],
["Fourth","73.9","52.3","32.0","0.5","12.5","0.6","0.2","6.3","0.2","21.6","11.5","9.9","0.2","26.1","100.0","908"],
["Highest","78.3","44.4","19.5","1.0","9.7","1.4","0.0","12.7","0.0","33.8","18.2","15.6","0.0","21.7","100.0","733"],
["Number of living children","","","","","","","","","","","","","","","",""],
["No children","25.1","7.6","0.3","0.5","2.0","0.0","0.0","4.8","0.0","17.5","9.0","8.5","0.0","74.9","100.0","563"],
["1 child","66.5","32.1","3.7","0.7","20.1","0.7","0.1","6.9","0.0","34.3","18.9","15.2","0.3","33.5","100.0","1,190"],
["1 son","66.8","33.2","4.1","0.7","21.1","0.5","0.3","6.6","0.0","33.5","21.2","12.3","0.0","33.2","100.0","672"],
["No sons","66.1","30.7","3.1","0.6","18.8","0.8","0.0","7.3","0.0","35.4","15.8","19.0","0.6","33.9","100.0","517"],
["2 children","81.6","60.5","41.8","0.9","11.6","0.8","0.3","4.8","0.2","21.1","12.2","8.3","0.6","18.4","100.0","1,576"],
["1 or more sons","83.7","64.2","46.4","0.9","10.8","0.8","0.4","4.8","0.1","19.5","11.1","7.6","0.7","16.3","100.0","1,268"],
["No sons","73.2","45.5","23.2","1.0","15.1","0.9","0.0","4.8","0.5","27.7","16.8","11.0","0.0","26.8","100.0","308"],
["3 children","83.9","71.2","57.7","0.8","9.8","0.6","0.5","1.8","0.0","12.7","8.7","3.3","0.8","16.1","100.0","961"],
["1 or more sons","85.0","73.2","60.3","0.9","9.4","0.5","0.5","1.6","0.0","11.8","8.1","3.0","0.7","15.0","100.0","860"],
["No sons","74.7","53.8","35.3","0.0","13.7","1.6","0.0","3.2","0.0","20.9","13.4","6.1","1.5","25.3","100.0","101"],
["4+ children","74.3","58.1","45.1","0.6","8.7","0.6","0.7","2.4","0.0","16.1","9.9","5.4","0.8","25.7","100.0","944"],
["1 or more sons","73.9","58.2","46.0","0.7","8.3","0.7","0.7","1.9","0.0","15.7","9.4","5.5","0.8","26.1","100.0","901"],
["No sons","(82.1)","(57.3)","(25.6)","(0.0)","(17.8)","(0.0)","(0.0)","(13.9)","(0.0)","(24.8)","(21.3)","(3.5)","(0.0)","(17.9)","100.0","43"],
["Total","71.2","49.9","32.2","0.7","11.7","0.6","0.3","4.3","0.1","21.3","12.3","8.4","0.5","28.8","100.0","5,234"],
["NFHS-2 (1998-99)","66.6","47.3","32.0","1.8","9.2","1.4","na","2.9","na","na","8.7","9.8","na","33.4","100.0","4,116"],
["NFHS-1 (1992-93)","57.7","37.6","26.5","4.3","3.6","1.3","0.1","1.9","na","na","11.3","8.3","na","42.3","100.0","3,970"],
["","Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are","","","","","","","","","","","","","","",""],
["not shown separately.","","","","","","","","","","","","","","","",""],
["na = Not available","","","","","","","","","","","","","","","",""],
["","ns = Not shown; see table 2b, footnote 1","","","","","","","","","","","","","","",""],
["( ) Based on 25-49 unweighted cases.","","","","","","","","","","","","","","","",""],
["","","","","","","","54","","","","","","","","",""]
]
pdfname = os.path.join(testdir, "left_rotated_table_2.pdf")
manager = Pdf(Stream(), pdfname, clean=True)
tables = manager.extract()
assert_equal(tables["page-1"]["table-1"]["data"], data)
pdfname = os.path.join(testdir, "right_rotated_table_2.pdf")
manager = Pdf(Stream(), pdfname, clean=True)
tables = manager.extract()
assert_equal(tables["page-1"]["table-1"]["data"], data) assert_equal(tables["page-1"]["table-1"]["data"], data)

View File

@ -34,7 +34,7 @@ options:
-l, --log Log to file. -l, --log Log to file.
-o, --output <directory> Output directory. -o, --output <directory> Output directory.
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are -M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
grouped together to form a word. [default: 2.0] grouped together to form a word. [default: 1.0]
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are -L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
grouped together to form a textbox. [default: 0.5] grouped together to form a textbox. [default: 0.5]
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars -W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
@ -87,7 +87,7 @@ options:
-y, --ytol <ytol> Tolerance to account for when grouping rows -y, --ytol <ytol> Tolerance to account for when grouping rows
together. [default: 2] together. [default: 2]
-m, --mtol <mtol> Tolerance to account for when merging columns -m, --mtol <mtol> Tolerance to account for when merging columns
together. [default: 2] together. [default: 0]
-d, --debug Debug by visualizing textboxes. -d, --debug Debug by visualizing textboxes.
""" """