Support for vertical tables in Stream
* Change var names * Add test pdf * Add tests for Lattice rotation * Add support for vertical tables in Stream, test pdfs * Add tests for Stream rotation
pull/2/head
parent
8ce7b74671
commit
79afb45e2e
|
|
@ -9,9 +9,10 @@ from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
|||
find_table_joints)
|
||||
from .table import Table
|
||||
from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_bbox,
|
||||
detect_vertical, merge_close_values, get_row_index,
|
||||
get_rotation, merge_close_values, get_row_index,
|
||||
get_column_index, get_score, reduce_index, outline,
|
||||
fill_spanning, count_empty, encode_list, pdf_to_text)
|
||||
fill_spanning, count_empty, encode_list, get_page_layout,
|
||||
get_text_objects)
|
||||
|
||||
|
||||
__all__ = ['Lattice']
|
||||
|
|
@ -62,7 +63,7 @@ class Lattice:
|
|||
page as value.
|
||||
"""
|
||||
def __init__(self, table_area=None, fill=None, mtol=[2], scale=15,
|
||||
invert=False, margins=(2.0, 0.5, 0.1), debug=None):
|
||||
invert=False, margins=(1.0, 0.5, 0.1), debug=None):
|
||||
|
||||
self.method = 'lattice'
|
||||
self.table_area = table_area
|
||||
|
|
@ -82,10 +83,14 @@ class Lattice:
|
|||
Dictionary with page number as key and list of tables on that
|
||||
page as value.
|
||||
"""
|
||||
text, __, width, height = pdf_to_text(pdfname, self.char_margin,
|
||||
self.line_margin, self.word_margin)
|
||||
layout, dim = get_page_layout(pdfname, char_margin=self.char_margin,
|
||||
line_margin=self.line_margin, word_margin=self.word_margin)
|
||||
ltchar = get_text_objects(layout, LTType="char")
|
||||
lttextlh = get_text_objects(layout, LTType="lh")
|
||||
lttextlv = get_text_objects(layout, LTType="lv")
|
||||
width, height = dim
|
||||
bname, __ = os.path.splitext(pdfname)
|
||||
if not text:
|
||||
if not ltchar:
|
||||
logging.warning("{0}: PDF has no text. It may be an image.".format(
|
||||
os.path.basename(bname)))
|
||||
return None
|
||||
|
|
@ -156,9 +161,11 @@ class Lattice:
|
|||
# select elements which lie within table_bbox
|
||||
table_data = {}
|
||||
v_s, h_s = segments_bbox(k, v_segments, h_segments)
|
||||
t_bbox = text_bbox(k, text)
|
||||
table_data['text_p'] = 100 * (1 - (len(t_bbox) / len(text)))
|
||||
table_rotation = detect_vertical(t_bbox)
|
||||
char_bbox = text_bbox(k, ltchar)
|
||||
lh_bbox = text_bbox(k, lttextlh)
|
||||
lv_bbox = text_bbox(k, lttextlv)
|
||||
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
|
||||
table_rotation = get_rotation(char_bbox, lh_bbox, lv_bbox)
|
||||
cols, rows = zip(*table_bbox[k])
|
||||
cols, rows = list(cols), list(rows)
|
||||
cols.extend([k[0], k[2]])
|
||||
|
|
@ -187,7 +194,7 @@ class Lattice:
|
|||
|
||||
rerror = []
|
||||
cerror = []
|
||||
for t in text:
|
||||
for t in char_bbox:
|
||||
try:
|
||||
r_idx, rass_error = get_row_index(t, rows)
|
||||
except TypeError:
|
||||
|
|
@ -207,7 +214,7 @@ class Lattice:
|
|||
for j in range(len(table.cells[i])):
|
||||
t_bbox = table.cells[i][j].get_objects()
|
||||
try:
|
||||
cell_rotation = detect_vertical(t_bbox)
|
||||
cell_rotation = get_rotation(t_bbox)
|
||||
except ZeroDivisionError:
|
||||
cell_rotation = ''
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -7,8 +7,8 @@ import copy_reg
|
|||
import numpy as np
|
||||
|
||||
from .table import Table
|
||||
from .utils import (get_row_index, get_score, count_empty, encode_list,
|
||||
pdf_to_text, text_bbox)
|
||||
from .utils import (rotate, get_row_index, get_score, count_empty, encode_list,
|
||||
get_page_layout, get_text_objects, text_bbox, get_rotation)
|
||||
|
||||
|
||||
__all__ = ['Stream']
|
||||
|
|
@ -199,7 +199,7 @@ class Stream:
|
|||
page as value.
|
||||
"""
|
||||
def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2],
|
||||
mtol=[2], margins=(2.0, 0.5, 0.1), debug=False):
|
||||
mtol=[0], margins=(1.0, 0.5, 0.1), debug=False):
|
||||
|
||||
self.method = 'stream'
|
||||
self.table_area = table_area
|
||||
|
|
@ -219,17 +219,20 @@ class Stream:
|
|||
Dictionary with page number as key and list of tables on that
|
||||
page as value.
|
||||
"""
|
||||
__, text, width, height = pdf_to_text(pdfname, self.char_margin,
|
||||
self.line_margin, self.word_margin)
|
||||
layout, dim = get_page_layout(pdfname, char_margin=self.char_margin,
|
||||
line_margin=self.line_margin, word_margin=self.word_margin)
|
||||
ltchar = get_text_objects(layout, LTType="char")
|
||||
lttextlh = get_text_objects(layout, LTType="lh")
|
||||
lttextlv = get_text_objects(layout, LTType="lv")
|
||||
width, height = dim
|
||||
bname, __ = os.path.splitext(pdfname)
|
||||
if not text:
|
||||
if not lttextlh:
|
||||
logging.warning("{0}: PDF has no text. It may be an image.".format(
|
||||
os.path.basename(bname)))
|
||||
return None
|
||||
|
||||
if self.debug:
|
||||
self.debug_text = [(t.x0, t.y0, t.x1, t.y1) for t in text]
|
||||
return None
|
||||
self.debug_text = []
|
||||
|
||||
if self.table_area is not None:
|
||||
if self.columns is not None:
|
||||
|
|
@ -261,11 +264,35 @@ class Stream:
|
|||
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
|
||||
# select elements which lie within table_bbox
|
||||
table_data = {}
|
||||
t_bbox = text_bbox(k, text)
|
||||
table_rotation = get_rotation(ltchar, lttextlh, lttextlv)
|
||||
if table_rotation != '':
|
||||
t_bbox = text_bbox(k, lttextlv)
|
||||
if table_rotation == 'left':
|
||||
if self.debug:
|
||||
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
|
||||
for t in t_bbox:
|
||||
x0, y0, x1, y1 = t.bbox
|
||||
x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2)
|
||||
x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2)
|
||||
t.set_bbox((x0, y1, x1, y0))
|
||||
elif table_rotation == 'right':
|
||||
for t in t_bbox:
|
||||
x0, y0, x1, y1 = t.bbox
|
||||
x0, y0 = rotate(0, 0, x0, y0, np.pi / 2)
|
||||
x1, y1 = rotate(0, 0, x1, y1, np.pi / 2)
|
||||
t.set_bbox((x1, y0, x0, y1))
|
||||
else:
|
||||
if self.debug:
|
||||
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
|
||||
t_bbox = text_bbox(k, lttextlh)
|
||||
t_bbox.sort(key=lambda x: (-x.y0, x.x0))
|
||||
|
||||
text_x_min = min([t.x0 for t in t_bbox])
|
||||
text_y_min = min([t.y0 for t in t_bbox])
|
||||
text_x_max = max([t.x1 for t in t_bbox])
|
||||
text_y_max = max([t.y1 for t in t_bbox])
|
||||
rows_grouped = _group_rows(t_bbox, ytol=self.ytol[table_no])
|
||||
rows = _join_rows(rows_grouped, k[3], k[1])
|
||||
rows = _join_rows(rows_grouped, text_y_max, text_y_min)
|
||||
elements = [len(r) for r in rows_grouped]
|
||||
|
||||
guess = False
|
||||
|
|
@ -275,8 +302,13 @@ class Stream:
|
|||
# similar to else condition
|
||||
# len can't be 1
|
||||
cols = self.columns[table_no].split(',')
|
||||
cols = [(float(cols[i]), float(cols[i + 1]))
|
||||
for i in range(0, len(cols) - 1)]
|
||||
cols = [float(c) for c in cols]
|
||||
if table_rotation != '':
|
||||
if table_rotation == 'left':
|
||||
cols = [rotate(0, 0, 0, c, -np.pi / 2)[0] for c in cols]
|
||||
elif table_rotation == 'right':
|
||||
cols = [rotate(0, 0, 0, c, np.pi / 2)[0] for c in cols]
|
||||
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
||||
else:
|
||||
if self.ncolumns is not None and self.ncolumns[table_no] != -1:
|
||||
ncols = self.ncolumns[table_no]
|
||||
|
|
@ -288,7 +320,7 @@ class Stream:
|
|||
" isn't the same as what you specified."
|
||||
" Change the value of mtol.".format(
|
||||
os.path.basename(bname)))
|
||||
cols = _join_columns(cols, k[0], k[2])
|
||||
cols = _join_columns(cols, text_x_min, text_x_max)
|
||||
else:
|
||||
guess = True
|
||||
ncols = max(set(elements), key=elements.count)
|
||||
|
|
@ -310,7 +342,7 @@ class Stream:
|
|||
outer_text = [t for t in t_bbox if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
|
||||
inner_text.extend(outer_text)
|
||||
cols = _add_columns(cols, inner_text, self.ytol[table_no])
|
||||
cols = _join_columns(cols, k[0], k[2])
|
||||
cols = _join_columns(cols, text_x_min, text_x_max)
|
||||
|
||||
table = Table(cols, rows)
|
||||
rerror = []
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ from pdfminer.pdfinterp import PDFResourceManager
|
|||
from pdfminer.pdfinterp import PDFPageInterpreter
|
||||
from pdfminer.pdfdevice import PDFDevice
|
||||
from pdfminer.converter import PDFPageAggregator
|
||||
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
|
||||
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal, LTTextLineVertical
|
||||
|
||||
|
||||
def translate(x1, x2):
|
||||
|
|
@ -144,7 +144,7 @@ def scale_to_pdf(tables, v_segments, h_segments, factors):
|
|||
return tables_new, v_segments_new, h_segments_new
|
||||
|
||||
|
||||
def detect_vertical(text):
|
||||
def get_rotation(ltchar, lttextlh=None, lttextlv=None):
|
||||
"""Detects if text in table is vertical or not and returns
|
||||
its orientation.
|
||||
|
||||
|
|
@ -156,13 +156,18 @@ def detect_vertical(text):
|
|||
-------
|
||||
rotation : string
|
||||
"""
|
||||
num_v = [t for t in text if (not t.upright) and t.get_text().strip()]
|
||||
num_h = [t for t in text if t.upright and t.get_text().strip()]
|
||||
vger = len(num_v) / float(len(num_v) + len(num_h))
|
||||
rotation = ''
|
||||
if vger > 0.8:
|
||||
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in text)
|
||||
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in text)
|
||||
if lttextlh is not None and lttextlv is not None:
|
||||
hlen = len([t for t in lttextlh if t.get_text().strip()])
|
||||
vlen = len([t for t in lttextlv if t.get_text().strip()])
|
||||
vger = 0.0
|
||||
else:
|
||||
hlen = len([t for t in ltchar if t.upright and t.get_text().strip()])
|
||||
vlen = len([t for t in ltchar if (not t.upright) and t.get_text().strip()])
|
||||
vger = vlen / float(hlen+vlen)
|
||||
if hlen < vlen or vger > 0.8:
|
||||
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
|
||||
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
|
||||
rotation = 'left' if clockwise < anticlockwise else 'right'
|
||||
return rotation
|
||||
|
||||
|
|
@ -520,7 +525,7 @@ def encode_list(ar):
|
|||
return ar
|
||||
|
||||
|
||||
def extract_text_objects(layout, LTObject, t=None):
|
||||
def get_text_objects(layout, LTType="char", t=None):
|
||||
"""Recursively parses pdf layout to get a list of
|
||||
text objects.
|
||||
|
||||
|
|
@ -539,6 +544,12 @@ def extract_text_objects(layout, LTObject, t=None):
|
|||
t : list
|
||||
List of text objects.
|
||||
"""
|
||||
if LTType == "char":
|
||||
LTObject = LTChar
|
||||
elif LTType == "lh":
|
||||
LTObject = LTTextLineHorizontal
|
||||
elif LTType == "lv":
|
||||
LTObject = LTTextLineVertical
|
||||
if t is None:
|
||||
t = []
|
||||
try:
|
||||
|
|
@ -546,15 +557,14 @@ def extract_text_objects(layout, LTObject, t=None):
|
|||
if isinstance(obj, LTObject):
|
||||
t.append(obj)
|
||||
else:
|
||||
t += extract_text_objects(obj, LTObject)
|
||||
t += get_text_objects(obj, LTType=LTType)
|
||||
except AttributeError:
|
||||
pass
|
||||
return t
|
||||
|
||||
|
||||
def pdf_to_text(pname, char_margin, line_margin, word_margin):
|
||||
# pkey = 'page-{0}'.format(p)
|
||||
# pname = os.path.join(self.temp, '{}.pdf'.format(pkey))
|
||||
def get_page_layout(pname, char_margin=2.0, line_margin=0.5, word_margin=0.1,
|
||||
detect_vertical=True, all_texts=True):
|
||||
with open(pname, 'r') as f:
|
||||
parser = PDFParser(f)
|
||||
document = PDFDocument(parser)
|
||||
|
|
@ -562,16 +572,16 @@ def pdf_to_text(pname, char_margin, line_margin, word_margin):
|
|||
raise PDFTextExtractionNotAllowed
|
||||
laparams = LAParams(char_margin=char_margin,
|
||||
line_margin=line_margin,
|
||||
word_margin=word_margin)
|
||||
word_margin=word_margin,
|
||||
detect_vertical=detect_vertical,
|
||||
all_texts=all_texts)
|
||||
rsrcmgr = PDFResourceManager()
|
||||
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
|
||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||
for page in PDFPage.create_pages(document):
|
||||
interpreter.process_page(page)
|
||||
layout = device.get_result()
|
||||
lattice_objects = extract_text_objects(layout, LTChar)
|
||||
stream_objects = extract_text_objects(
|
||||
layout, LTTextLineHorizontal)
|
||||
width = layout.bbox[2]
|
||||
height = layout.bbox[3]
|
||||
return lattice_objects, stream_objects, width, height
|
||||
dim = (width, height)
|
||||
return layout, dim
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -94,4 +94,74 @@ def test_lattice_invert():
|
|||
pdfname = os.path.join(testdir, 'lines_in_background_1.pdf')
|
||||
manager = Pdf(Lattice(invert=True), pdfname, clean=True)
|
||||
tables = manager.extract()
|
||||
assert_equal(tables['page-1']['table-2']['data'], data)
|
||||
assert_equal(tables['page-1']['table-2']['data'], data)
|
||||
|
||||
|
||||
def test_lattice_table_rotation():
    """Check Lattice output for the left- and right-rotated table fixtures.

    Both fixture PDFs are asserted against the same expected rows.
    """
    expected = [
        ["State","Nutritional Assessment (No. of individuals)","","","","IYCF Practices (No. of mothers: 2011-12)","Blood Pressure (No. of adults: 2011-12)","","Fasting Blood Sugar (No. of adults:2011-12)",""],
        ["","1975-79","1988-90","1996-97","2011-12","","Men","Women","Men","Women"],
        ["Kerala","5738","6633","8864","8297","245","2161","3195","1645","2391"],
        ["Tamil Nadu","7387","10217","5813","7851","413","2134","2858","1119","1739"],
        ["Karnataka","6453","8138","12606","8958","428","2467","2894","1628","2028"],
        ["Andhra Pradesh","5844","9920","9545","8300","557","1899","2493","1111","1529"],
        ["Maharashtra","5161","7796","6883","9525","467","2368","2648","1417","1599"],
        ["Gujarat","4403","5374","4866","9645","477","2687","3021","2122","2503"],
        ["Madhya Pradesh","*","*","*","7942","470","1965","2150","1579","1709"],
        ["Orissa","3756","5540","12024","8473","398","2040","2624","1093","1628"],
        ["West Bengal","*","*","*","8047","423","2058","2743","1413","2027"],
        ["Uttar Pradesh","*","*","*","9860","581","2139","2415","1185","1366"],
        ["Pooled","38742","53618","60601","86898","4459","21918","27041","14312","18519"]
    ]
    # Run the left-rotated fixture first, then the right-rotated one.
    for fixture in ('left_rotated_table_1.pdf', 'right_rotated_table_1.pdf'):
        pdfname = os.path.join(testdir, fixture)
        extracted = Pdf(Lattice(), pdfname, clean=True).extract()
        assert_equal(extracted['page-1']['table-1']['data'], expected)
|
||||
|
||||
def test_lattice_cell_rotation():
    """Check Lattice output for the 'agstat.pdf' fixture against expected rows."""
    expected = [
        ["Sl.No.","District","Projected Population for 2012-13(In lakhs)","Adult Equivalent to 88%(In lakhs)","Total Consumptionrequirement(@ 400gms/adult/day)(In Lakh tonnes)","Total Requirement(Including seeds, feeds & wastage)(In Lakh tonnes)","Production (Rice)(In Lakh tonnes)","","","Surplus/Defi cit(In Lakh tonnes)",""],
        ["","","","","","","Kharif","Rabi","Total","Rice","Paddy"],
        ["1","Balasore","23.65","20.81","3.04","3.47","2.78","0.86","3.64","0.17","0.25"],
        ["2","Bhadrak","15.34","13.50","1.97","2.25","3.50","0.05","3.55","1.30","1.94"],
        ["3","Balangir","17.01","14.97","2.19","2.50","6.23","0.10","6.33","3.83","5.72"],
        ["4","Subarnapur","6.70","5.90","0.86","0.98","4.48","1.13","5.61","4.63","6.91"],
        ["5","Cuttack","26.63","23.43","3.42","3.91","3.75","0.06","3.81","-0.10","-0.15"],
        ["6","Jagatsingpur","11.49","10.11","1.48","1.69","2.10","0.02","2.12","0.43","0.64"],
        ["7","Jajpur","18.59","16.36","2.39","2.73","2.13","0.04","2.17","-0.56","-0.84"],
        ["8","Kendrapara","14.62","12.87","1.88","2.15","2.60","0.07","2.67","0.52","0.78"],
        ["9","Dhenkanal","12.13","10.67","1.56","1.78","2.26","0.02","2.28","0.50","0.75"],
        ["10","Angul","12.93","11.38","1.66","1.90","1.73","0.02","1.75","-0.15","-0.22"],
        ["11","Ganjam","35.77","31.48","4.60","5.26","4.57","0.00","4.57","-0.69","-1.03"],
        ["12","Gajapati","5.85","5.15","0.75","0.86","0.68","0.01","0.69","-0.17","-0.25"],
        ["13","Kalahandi","16.12","14.19","2.07","2.37","5.42","1.13","6.55","4.18","6.24"],
        ["14","Nuapada","6.18","5.44","0.79","0.90","1.98","0.08","2.06","1.16","1.73"],
        ["15","Keonjhar","18.42","16.21","2.37","2.71","2.76","0.08","2.84","0.13","0.19"],
        ["16","Koraput","14.09","12.40","1.81","2.07","2.08","0.34","2.42","0.35","0.52"],
        ["17","Malkangiri","6.31","5.55","0.81","0.93","1.78","0.04","1.82","0.89","1.33"],
        ["18","Nabarangpur","12.50","11.00","1.61","1.84","3.26","0.02","3.28","1.44","2.15"],
        ["19","Rayagada","9.83","8.65","1.26","1.44","1.15","0.03","1.18","-0.26","-0.39"],
        ["20","Mayurbhanj","25.61","22.54","3.29","3.76","4.90","0.06","4.96","1.20","1.79"],
        ["21","Kandhamal","7.45","6.56","0.96","1.10","0.70","0.01","0.71","-0.39","-0.58"],
        ["22","Boudh","4.51","3.97","0.58","0.66","1.73","0.03","1.76","1.10","1.64"],
        ["23","Puri","17.29","15.22","2.22","2.54","2.45","0.99","3.44","0.90","1.34"],
        ["24","Khordha","23.08","20.31","2.97","3.39","2.02","0.03","2.05","-1.34","-2.00"],
        ["25","Nayagarh","9.78","8.61","1.26","1.44","2.10","0.00","2.10","0.66","0.99"],
        ["26","Sambalpur","10.62","9.35","1.37","1.57","3.45","0.71","4.16","2.59","3.87"],
        ["27","Bargarh","15.00","13.20","1.93","2.21","6.87","2.65","9.52","7.31","10.91"],
        ["28","Deogarh","3.18","2.80","0.41","0.47","1.12","0.07","1.19","0.72","1.07"],
        ["29","Jharsuguda","5.91","5.20","0.76","0.87","0.99","0.01","1.00","0.13","0.19"],
        ["30","Sundargarh","21.21","18.66","2.72","3.11","4.72","0.02","4.74","1.63","2.43"],
        ["ODISHA","","427.80","376.49","54.99","62.86","86.29","8.68","94.97","32.11","47.92"]
    ]
    fixture_path = os.path.join(testdir, 'agstat.pdf')
    extracted = Pdf(Lattice(), fixture_path, clean=True).extract()
    first_table = extracted['page-1']['table-1']
    assert_equal(first_table['data'], expected)
|
||||
|
|
@ -81,11 +81,11 @@ def test_stream_missing_value():
|
|||
["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""],
|
||||
["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""],
|
||||
["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years","","","",""],
|
||||
["","Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","",""],
|
||||
["Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","","",""],
|
||||
["4","","","",""]
|
||||
]
|
||||
pdfname = os.path.join(testdir, "missing_values.pdf")
|
||||
manager = Pdf(Stream(margins=(1.0, 0.5, 0.1)), pdfname, clean=True)
|
||||
manager = Pdf(Stream(), pdfname, clean=True)
|
||||
tables = manager.extract()
|
||||
assert_equal(tables["page-1"]["table-1"]["data"], data)
|
||||
|
||||
|
|
@ -106,8 +106,7 @@ def test_stream_single_table_area():
|
|||
["(each day of the payroll period)",""]
|
||||
]
|
||||
pdfname = os.path.join(testdir, "tabula_test_pdfs/us-007.pdf")
|
||||
manager = Pdf(Stream(table_area=["320,500,573,335"], ytol=[10],
|
||||
margins=(1.0, 0.5, 0.1)),
|
||||
manager = Pdf(Stream(table_area=["320,500,573,335"]),
|
||||
pdfname, pagenos=[{"start": 1, "end": 1}], clean=True)
|
||||
tables = manager.extract()
|
||||
assert_equal(tables["page-1"]["table-1"]["data"], data)
|
||||
|
|
@ -164,4 +163,58 @@ def test_stream_columns():
|
|||
manager = Pdf(Stream(columns=["28,67,180,230,425,475,700"], ytol=[10]), pdfname,
|
||||
clean=True)
|
||||
tables = manager.extract()
|
||||
assert_equal(tables["page-1"]["table-1"]["data"], data)
|
||||
|
||||
|
||||
def test_stream_table_rotation():
    """Check Stream output for the left- and right-rotated table fixtures.

    Both fixture PDFs are asserted against the same expected rows.
    """
    expected = [
        ["Table 21 Current use of contraception by background characteristics—Continued","","","","","","","","","","","","","","","",""],
        ["","","","","","Modern method","","","","","","","Traditional method","","","",""],
        ["","","Any","","","","","","","Other","Any","","","","Not","","Number"],
        ["","Any","modern","Female","Male","","","","Condom/","modern","traditional","","With-","Folk","currently","","of"],
        ["Background characteristic","method","method","sterilization","sterilization","Pill","IUD","Injectables","Nirodh","method","method","Rhythm","drawal","method","using","Total","women"],
        ["Caste/tribe","","","","","","","","","","","","","","","",""],
        ["Scheduled caste","74.8","55.8","42.9","0.9","9.7","0.0","0.2","2.2","0.0","19.0","11.2","7.4","0.4","25.2","100.0","1,363"],
        ["Scheduled tribe","59.3","39.0","26.8","0.6","6.4","0.6","1.2","3.5","0.0","20.3","10.4","5.8","4.1","40.7","100.0","256"],
        ["Other backward class","71.4","51.1","34.9","0.0","8.6","1.4","0.0","6.2","0.0","20.4","12.6","7.8","0.0","28.6","100.0","211"],
        ["Other","71.1","48.8","28.2","0.8","13.3","0.9","0.3","5.2","0.1","22.3","12.9","9.1","0.3","28.9","100.0","3,319"],
        ["Wealth index","","","","","","","","","","","","","","","",""],
        ["Lowest","64.5","48.6","34.3","0.5","10.5","0.6","0.7","2.0","0.0","15.9","9.9","4.6","1.4","35.5","100.0","1,258"],
        ["Second","68.5","50.4","36.2","1.1","11.4","0.5","0.1","1.1","0.0","18.1","11.2","6.7","0.2","31.5","100.0","1,317"],
        ["Middle","75.5","52.8","33.6","0.6","14.2","0.4","0.5","3.4","0.1","22.7","13.4","8.9","0.4","24.5","100.0","1,018"],
        ["Fourth","73.9","52.3","32.0","0.5","12.5","0.6","0.2","6.3","0.2","21.6","11.5","9.9","0.2","26.1","100.0","908"],
        ["Highest","78.3","44.4","19.5","1.0","9.7","1.4","0.0","12.7","0.0","33.8","18.2","15.6","0.0","21.7","100.0","733"],
        ["Number of living children","","","","","","","","","","","","","","","",""],
        ["No children","25.1","7.6","0.3","0.5","2.0","0.0","0.0","4.8","0.0","17.5","9.0","8.5","0.0","74.9","100.0","563"],
        ["1 child","66.5","32.1","3.7","0.7","20.1","0.7","0.1","6.9","0.0","34.3","18.9","15.2","0.3","33.5","100.0","1,190"],
        ["1 son","66.8","33.2","4.1","0.7","21.1","0.5","0.3","6.6","0.0","33.5","21.2","12.3","0.0","33.2","100.0","672"],
        ["No sons","66.1","30.7","3.1","0.6","18.8","0.8","0.0","7.3","0.0","35.4","15.8","19.0","0.6","33.9","100.0","517"],
        ["2 children","81.6","60.5","41.8","0.9","11.6","0.8","0.3","4.8","0.2","21.1","12.2","8.3","0.6","18.4","100.0","1,576"],
        ["1 or more sons","83.7","64.2","46.4","0.9","10.8","0.8","0.4","4.8","0.1","19.5","11.1","7.6","0.7","16.3","100.0","1,268"],
        ["No sons","73.2","45.5","23.2","1.0","15.1","0.9","0.0","4.8","0.5","27.7","16.8","11.0","0.0","26.8","100.0","308"],
        ["3 children","83.9","71.2","57.7","0.8","9.8","0.6","0.5","1.8","0.0","12.7","8.7","3.3","0.8","16.1","100.0","961"],
        ["1 or more sons","85.0","73.2","60.3","0.9","9.4","0.5","0.5","1.6","0.0","11.8","8.1","3.0","0.7","15.0","100.0","860"],
        ["No sons","74.7","53.8","35.3","0.0","13.7","1.6","0.0","3.2","0.0","20.9","13.4","6.1","1.5","25.3","100.0","101"],
        ["4+ children","74.3","58.1","45.1","0.6","8.7","0.6","0.7","2.4","0.0","16.1","9.9","5.4","0.8","25.7","100.0","944"],
        ["1 or more sons","73.9","58.2","46.0","0.7","8.3","0.7","0.7","1.9","0.0","15.7","9.4","5.5","0.8","26.1","100.0","901"],
        ["No sons","(82.1)","(57.3)","(25.6)","(0.0)","(17.8)","(0.0)","(0.0)","(13.9)","(0.0)","(24.8)","(21.3)","(3.5)","(0.0)","(17.9)","100.0","43"],
        ["Total","71.2","49.9","32.2","0.7","11.7","0.6","0.3","4.3","0.1","21.3","12.3","8.4","0.5","28.8","100.0","5,234"],
        ["NFHS-2 (1998-99)","66.6","47.3","32.0","1.8","9.2","1.4","na","2.9","na","na","8.7","9.8","na","33.4","100.0","4,116"],
        ["NFHS-1 (1992-93)","57.7","37.6","26.5","4.3","3.6","1.3","0.1","1.9","na","na","11.3","8.3","na","42.3","100.0","3,970"],
        ["","Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are","","","","","","","","","","","","","","",""],
        ["not shown separately.","","","","","","","","","","","","","","","",""],
        ["na = Not available","","","","","","","","","","","","","","","",""],
        ["","ns = Not shown; see table 2b, footnote 1","","","","","","","","","","","","","","",""],
        ["( ) Based on 25-49 unweighted cases.","","","","","","","","","","","","","","","",""],
        ["","","","","","","","54","","","","","","","","",""]
    ]
    # Run the left-rotated fixture first, then the right-rotated one.
    for fixture in ("left_rotated_table_2.pdf", "right_rotated_table_2.pdf"):
        pdfname = os.path.join(testdir, fixture)
        extracted = Pdf(Stream(), pdfname, clean=True).extract()
        assert_equal(extracted["page-1"]["table-1"]["data"], expected)
|
||||
|
|
@ -34,7 +34,7 @@ options:
|
|||
-l, --log Log to file.
|
||||
-o, --output <directory> Output directory.
|
||||
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
|
||||
grouped together to form a word. [default: 2.0]
|
||||
grouped together to form a word. [default: 1.0]
|
||||
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
|
||||
grouped together to form a textbox. [default: 0.5]
|
||||
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
|
||||
|
|
@ -87,7 +87,7 @@ options:
|
|||
-y, --ytol <ytol> Tolerance to account for when grouping rows
|
||||
together. [default: 2]
|
||||
-m, --mtol <mtol> Tolerance to account for when merging columns
|
||||
together. [default: 2]
|
||||
together. [default: 0]
|
||||
-d, --debug Debug by visualizing textboxes.
|
||||
"""
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue