diff --git a/camelot/lattice.py b/camelot/lattice.py index 3819fbb..e372b9b 100644 --- a/camelot/lattice.py +++ b/camelot/lattice.py @@ -9,9 +9,10 @@ from .imgproc import (adaptive_threshold, find_lines, find_table_contours, find_table_joints) from .table import Table from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_bbox, - detect_vertical, merge_close_values, get_row_index, + get_rotation, merge_close_values, get_row_index, get_column_index, get_score, reduce_index, outline, - fill_spanning, count_empty, encode_list, pdf_to_text) + fill_spanning, count_empty, encode_list, get_page_layout, + get_text_objects) __all__ = ['Lattice'] @@ -62,7 +63,7 @@ class Lattice: page as value. """ def __init__(self, table_area=None, fill=None, mtol=[2], scale=15, - invert=False, margins=(2.0, 0.5, 0.1), debug=None): + invert=False, margins=(1.0, 0.5, 0.1), debug=None): self.method = 'lattice' self.table_area = table_area @@ -82,10 +83,14 @@ class Lattice: Dictionary with page number as key and list of tables on that page as value. """ - text, __, width, height = pdf_to_text(pdfname, self.char_margin, - self.line_margin, self.word_margin) + layout, dim = get_page_layout(pdfname, char_margin=self.char_margin, + line_margin=self.line_margin, word_margin=self.word_margin) + ltchar = get_text_objects(layout, LTType="char") + lttextlh = get_text_objects(layout, LTType="lh") + lttextlv = get_text_objects(layout, LTType="lv") + width, height = dim bname, __ = os.path.splitext(pdfname) - if not text: + if not ltchar: logging.warning("{0}: PDF has no text. It may be an image.".format( os.path.basename(bname))) return None @@ -156,9 +161,11 @@ class Lattice: # select elements which lie within table_bbox table_data = {} v_s, h_s = segments_bbox(k, v_segments, h_segments) - t_bbox = text_bbox(k, text) - table_data['text_p'] = 100 * (1 - (len(t_bbox) / len(text))) - table_rotation = detect_vertical(t_bbox) + char_bbox = text_bbox(k, ltchar) + lh_bbox = text_bbox(k, lttextlh) + lv_bbox = text_bbox(k, lttextlv) + table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar))) + table_rotation = get_rotation(char_bbox, lh_bbox, lv_bbox) cols, rows = zip(*table_bbox[k]) cols, rows = list(cols), list(rows) cols.extend([k[0], k[2]]) @@ -187,7 +194,7 @@ class Lattice: rerror = [] cerror = [] - for t in text: + for t in char_bbox: try: r_idx, rass_error = get_row_index(t, rows) except TypeError: @@ -207,7 +214,7 @@ class Lattice: for j in range(len(table.cells[i])): t_bbox = table.cells[i][j].get_objects() try: - cell_rotation = detect_vertical(t_bbox) + cell_rotation = get_rotation(t_bbox) except ZeroDivisionError: cell_rotation = '' pass diff --git a/camelot/stream.py b/camelot/stream.py index 010b0d7..efc1eda 100644 --- a/camelot/stream.py +++ b/camelot/stream.py @@ -7,8 +7,8 @@ import copy_reg import numpy as np from .table import Table -from .utils import (get_row_index, get_score, count_empty, encode_list, - pdf_to_text, text_bbox) +from .utils import (rotate, get_row_index, get_score, count_empty, encode_list, + get_page_layout, get_text_objects, text_bbox, get_rotation) __all__ = ['Stream'] @@ -199,7 +199,7 @@ class Stream: page as value. """ def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2], - mtol=[2], margins=(2.0, 0.5, 0.1), debug=False): + mtol=[0], margins=(1.0, 0.5, 0.1), debug=False): self.method = 'stream' self.table_area = table_area @@ -219,17 +219,20 @@ class Stream: Dictionary with page number as key and list of tables on that page as value. """ - __, text, width, height = pdf_to_text(pdfname, self.char_margin, - self.line_margin, self.word_margin) + layout, dim = get_page_layout(pdfname, char_margin=self.char_margin, + line_margin=self.line_margin, word_margin=self.word_margin) + ltchar = get_text_objects(layout, LTType="char") + lttextlh = get_text_objects(layout, LTType="lh") + lttextlv = get_text_objects(layout, LTType="lv") + width, height = dim bname, __ = os.path.splitext(pdfname) - if not text: + if not lttextlh: logging.warning("{0}: PDF has no text. It may be an image.".format( os.path.basename(bname))) return None if self.debug: - self.debug_text = [(t.x0, t.y0, t.x1, t.y1) for t in text] - return None + self.debug_text = [] if self.table_area is not None: if self.columns is not None: @@ -261,11 +264,35 @@ class Stream: for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True): # select elements which lie within table_bbox table_data = {} - t_bbox = text_bbox(k, text) + table_rotation = get_rotation(ltchar, lttextlh, lttextlv) + if table_rotation != '': + t_bbox = text_bbox(k, lttextlv) + if table_rotation == 'left': + if self.debug: + self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv]) + for t in t_bbox: + x0, y0, x1, y1 = t.bbox + x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2) + x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2) + t.set_bbox((x0, y1, x1, y0)) + elif table_rotation == 'right': + for t in t_bbox: + x0, y0, x1, y1 = t.bbox + x0, y0 = rotate(0, 0, x0, y0, np.pi / 2) + x1, y1 = rotate(0, 0, x1, y1, np.pi / 2) + t.set_bbox((x1, y0, x0, y1)) + else: + if self.debug: + self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh]) + t_bbox = text_bbox(k, lttextlh) t_bbox.sort(key=lambda x: (-x.y0, x.x0)) + text_x_min = min([t.x0 for t in t_bbox]) + text_y_min = min([t.y0 for t in t_bbox]) + text_x_max = max([t.x1 for t in t_bbox]) + text_y_max = max([t.y1 for t in t_bbox]) rows_grouped = _group_rows(t_bbox, ytol=self.ytol[table_no]) - rows = _join_rows(rows_grouped, k[3], k[1]) + rows = _join_rows(rows_grouped, text_y_max, text_y_min) elements = [len(r) for r in rows_grouped] guess = False @@ -275,8 +302,13 @@ class Stream: # similar to else condition # len can't be 1 cols = self.columns[table_no].split(',') - cols = [(float(cols[i]), float(cols[i + 1])) - for i in range(0, len(cols) - 1)] + cols = [float(c) for c in cols] + if table_rotation != '': + if table_rotation == 'left': + cols = [rotate(0, 0, 0, c, -np.pi / 2)[0] for c in cols] + elif table_rotation == 'right': + cols = [rotate(0, 0, 0, c, np.pi / 2)[0] for c in cols] + cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] else: if self.ncolumns is not None and self.ncolumns[table_no] != -1: ncols = self.ncolumns[table_no] @@ -288,7 +320,7 @@ class Stream: " isn't the same as what you specified." " Change the value of mtol.".format( os.path.basename(bname))) - cols = _join_columns(cols, k[0], k[2]) + cols = _join_columns(cols, text_x_min, text_x_max) else: guess = True ncols = max(set(elements), key=elements.count) @@ -310,7 +342,7 @@ class Stream: outer_text = [t for t in t_bbox if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] inner_text.extend(outer_text) cols = _add_columns(cols, inner_text, self.ytol[table_no]) - cols = _join_columns(cols, k[0], k[2]) + cols = _join_columns(cols, text_x_min, text_x_max) table = Table(cols, rows) rerror = [] diff --git a/camelot/utils.py b/camelot/utils.py index 584c95c..8551851 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -11,7 +11,7 @@ from pdfminer.pdfinterp import PDFResourceManager from pdfminer.pdfinterp import PDFPageInterpreter from pdfminer.pdfdevice import PDFDevice from pdfminer.converter import PDFPageAggregator -from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal +from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal, LTTextLineVertical def translate(x1, x2): @@ -144,7 +144,7 @@ def scale_to_pdf(tables, v_segments, h_segments, factors): return tables_new, v_segments_new, h_segments_new -def detect_vertical(text): +def get_rotation(ltchar, lttextlh=None, lttextlv=None): """Detects if text in table is vertical or not and returns its orientation. @@ -156,13 +156,18 @@ def detect_vertical(text): ------- rotation : string """ - num_v = [t for t in text if (not t.upright) and t.get_text().strip()] - num_h = [t for t in text if t.upright and t.get_text().strip()] - vger = len(num_v) / float(len(num_v) + len(num_h)) rotation = '' - if vger > 0.8: - clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in text) - anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in text) + if lttextlh is not None and lttextlv is not None: + hlen = len([t for t in lttextlh if t.get_text().strip()]) + vlen = len([t for t in lttextlv if t.get_text().strip()]) + vger = 0.0 + else: + hlen = len([t for t in ltchar if t.upright and t.get_text().strip()]) + vlen = len([t for t in ltchar if (not t.upright) and t.get_text().strip()]) + vger = vlen / float(hlen+vlen) + if hlen < vlen or vger > 0.8: + clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar) + anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar) rotation = 'left' if clockwise < anticlockwise else 'right' return rotation @@ -520,7 +525,7 @@ def encode_list(ar): return ar -def extract_text_objects(layout, LTObject, t=None): +def get_text_objects(layout, LTType="char", t=None): """Recursively parses pdf layout to get a list of text objects. @@ -539,6 +544,12 @@ def extract_text_objects(layout, LTObject, t=None): t : list List of text objects. """ + if LTType == "char": + LTObject = LTChar + elif LTType == "lh": + LTObject = LTTextLineHorizontal + elif LTType == "lv": + LTObject = LTTextLineVertical if t is None: t = [] try: @@ -546,15 +557,14 @@ def extract_text_objects(layout, LTObject, t=None): if isinstance(obj, LTObject): t.append(obj) else: - t += extract_text_objects(obj, LTObject) + t += get_text_objects(obj, LTType=LTType) except AttributeError: pass return t -def pdf_to_text(pname, char_margin, line_margin, word_margin): - # pkey = 'page-{0}'.format(p) - # pname = os.path.join(self.temp, '{}.pdf'.format(pkey)) +def get_page_layout(pname, char_margin=2.0, line_margin=0.5, word_margin=0.1, + detect_vertical=True, all_texts=True): with open(pname, 'r') as f: parser = PDFParser(f) document = PDFDocument(parser) @@ -562,16 +572,16 @@ def pdf_to_text(pname, char_margin, line_margin, word_margin): raise PDFTextExtractionNotAllowed laparams = LAParams(char_margin=char_margin, line_margin=line_margin, - word_margin=word_margin) + word_margin=word_margin, + detect_vertical=detect_vertical, + all_texts=all_texts) rsrcmgr = PDFResourceManager() device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) for page in PDFPage.create_pages(document): interpreter.process_page(page) layout = device.get_result() - lattice_objects = extract_text_objects(layout, LTChar) - stream_objects = extract_text_objects( - layout, LTTextLineHorizontal) width = layout.bbox[2] height = layout.bbox[3] - return lattice_objects, stream_objects, width, height \ No newline at end of file + dim = (width, height) + return layout, dim \ No newline at end of file diff --git a/tests/agstat.pdf b/tests/agstat.pdf new file mode 100644 index 0000000..cf1c25a Binary files /dev/null and b/tests/agstat.pdf differ diff --git a/tests/left_rotated_table.pdf b/tests/left_rotated_table_1.pdf similarity index 100% rename from tests/left_rotated_table.pdf rename to tests/left_rotated_table_1.pdf diff --git a/tests/left_rotated_table_2.pdf b/tests/left_rotated_table_2.pdf new file mode 100644 index 0000000..a5f7c14 Binary files /dev/null and b/tests/left_rotated_table_2.pdf differ diff --git a/tests/right_rotated_table.pdf b/tests/right_rotated_table_1.pdf similarity index 100% rename from tests/right_rotated_table.pdf rename to tests/right_rotated_table_1.pdf diff --git a/tests/right_rotated_table_2.pdf b/tests/right_rotated_table_2.pdf new file mode 100644 index 0000000..5cee89b Binary files /dev/null and b/tests/right_rotated_table_2.pdf differ diff --git a/tests/test_lattice.py b/tests/test_lattice.py index 419114b..566217a 100644 --- a/tests/test_lattice.py +++ b/tests/test_lattice.py @@ -94,4 +94,74 @@ def test_lattice_invert(): pdfname = os.path.join(testdir, 'lines_in_background_1.pdf') manager = Pdf(Lattice(invert=True), pdfname, clean=True) tables = manager.extract() - assert_equal(tables['page-1']['table-2']['data'], data) \ No newline at end of file + assert_equal(tables['page-1']['table-2']['data'], data) + + +def test_lattice_table_rotation(): + + data = [ + ["State","Nutritional Assessment (No. of individuals)","","","","IYCF Practices (No. of mothers: 2011-12)","Blood Pressure (No. of adults: 2011-12)","","Fasting Blood Sugar (No. of adults:2011-12)",""], + ["","1975-79","1988-90","1996-97","2011-12","","Men","Women","Men","Women"], + ["Kerala","5738","6633","8864","8297","245","2161","3195","1645","2391"], + ["Tamil Nadu","7387","10217","5813","7851","413","2134","2858","1119","1739"], + ["Karnataka","6453","8138","12606","8958","428","2467","2894","1628","2028"], + ["Andhra Pradesh","5844","9920","9545","8300","557","1899","2493","1111","1529"], + ["Maharashtra","5161","7796","6883","9525","467","2368","2648","1417","1599"], + ["Gujarat","4403","5374","4866","9645","477","2687","3021","2122","2503"], + ["Madhya Pradesh","*","*","*","7942","470","1965","2150","1579","1709"], + ["Orissa","3756","5540","12024","8473","398","2040","2624","1093","1628"], + ["West Bengal","*","*","*","8047","423","2058","2743","1413","2027"], + ["Uttar Pradesh","*","*","*","9860","581","2139","2415","1185","1366"], + ["Pooled","38742","53618","60601","86898","4459","21918","27041","14312","18519"] + ] + pdfname = os.path.join(testdir, 'left_rotated_table_1.pdf') + manager = Pdf(Lattice(), pdfname, clean=True) + tables = manager.extract() + assert_equal(tables['page-1']['table-1']['data'], data) + + pdfname = os.path.join(testdir, 'right_rotated_table_1.pdf') + manager = Pdf(Lattice(), pdfname, clean=True) + tables = manager.extract() + assert_equal(tables['page-1']['table-1']['data'], data) + +def test_lattice_cell_rotation(): + + data = [ + ["Sl.No.","District","Projected Population for 2012-13(In lakhs)","Adult Equivalent to 88%(In lakhs)","Total Consumptionrequirement(@ 400gms/adult/day)(In Lakh tonnes)","Total Requirement(Including seeds, feeds & wastage)(In Lakh tonnes)","Production (Rice)(In Lakh tonnes)","","","Surplus/Defi cit(In Lakh tonnes)",""], + ["","","","","","","Kharif","Rabi","Total","Rice","Paddy"], + ["1","Balasore","23.65","20.81","3.04","3.47","2.78","0.86","3.64","0.17","0.25"], + ["2","Bhadrak","15.34","13.50","1.97","2.25","3.50","0.05","3.55","1.30","1.94"], + ["3","Balangir","17.01","14.97","2.19","2.50","6.23","0.10","6.33","3.83","5.72"], + ["4","Subarnapur","6.70","5.90","0.86","0.98","4.48","1.13","5.61","4.63","6.91"], + ["5","Cuttack","26.63","23.43","3.42","3.91","3.75","0.06","3.81","-0.10","-0.15"], + ["6","Jagatsingpur","11.49","10.11","1.48","1.69","2.10","0.02","2.12","0.43","0.64"], + ["7","Jajpur","18.59","16.36","2.39","2.73","2.13","0.04","2.17","-0.56","-0.84"], + ["8","Kendrapara","14.62","12.87","1.88","2.15","2.60","0.07","2.67","0.52","0.78"], + ["9","Dhenkanal","12.13","10.67","1.56","1.78","2.26","0.02","2.28","0.50","0.75"], + ["10","Angul","12.93","11.38","1.66","1.90","1.73","0.02","1.75","-0.15","-0.22"], + ["11","Ganjam","35.77","31.48","4.60","5.26","4.57","0.00","4.57","-0.69","-1.03"], + ["12","Gajapati","5.85","5.15","0.75","0.86","0.68","0.01","0.69","-0.17","-0.25"], + ["13","Kalahandi","16.12","14.19","2.07","2.37","5.42","1.13","6.55","4.18","6.24"], + ["14","Nuapada","6.18","5.44","0.79","0.90","1.98","0.08","2.06","1.16","1.73"], + ["15","Keonjhar","18.42","16.21","2.37","2.71","2.76","0.08","2.84","0.13","0.19"], + ["16","Koraput","14.09","12.40","1.81","2.07","2.08","0.34","2.42","0.35","0.52"], + ["17","Malkangiri","6.31","5.55","0.81","0.93","1.78","0.04","1.82","0.89","1.33"], + ["18","Nabarangpur","12.50","11.00","1.61","1.84","3.26","0.02","3.28","1.44","2.15"], + ["19","Rayagada","9.83","8.65","1.26","1.44","1.15","0.03","1.18","-0.26","-0.39"], + ["20","Mayurbhanj","25.61","22.54","3.29","3.76","4.90","0.06","4.96","1.20","1.79"], + ["21","Kandhamal","7.45","6.56","0.96","1.10","0.70","0.01","0.71","-0.39","-0.58"], + ["22","Boudh","4.51","3.97","0.58","0.66","1.73","0.03","1.76","1.10","1.64"], + ["23","Puri","17.29","15.22","2.22","2.54","2.45","0.99","3.44","0.90","1.34"], + ["24","Khordha","23.08","20.31","2.97","3.39","2.02","0.03","2.05","-1.34","-2.00"], + ["25","Nayagarh","9.78","8.61","1.26","1.44","2.10","0.00","2.10","0.66","0.99"], + ["26","Sambalpur","10.62","9.35","1.37","1.57","3.45","0.71","4.16","2.59","3.87"], + ["27","Bargarh","15.00","13.20","1.93","2.21","6.87","2.65","9.52","7.31","10.91"], + ["28","Deogarh","3.18","2.80","0.41","0.47","1.12","0.07","1.19","0.72","1.07"], + ["29","Jharsuguda","5.91","5.20","0.76","0.87","0.99","0.01","1.00","0.13","0.19"], + ["30","Sundargarh","21.21","18.66","2.72","3.11","4.72","0.02","4.74","1.63","2.43"], + ["ODISHA","","427.80","376.49","54.99","62.86","86.29","8.68","94.97","32.11","47.92"] + ] + pdfname = os.path.join(testdir, 'agstat.pdf') + manager = Pdf(Lattice(), pdfname, clean=True) + tables = manager.extract() + assert_equal(tables['page-1']['table-1']['data'], data) \ No newline at end of file diff --git a/tests/test_stream.py b/tests/test_stream.py index 85eca91..3535950 100644 --- a/tests/test_stream.py +++ b/tests/test_stream.py @@ -81,11 +81,11 @@ def test_stream_missing_value(): ["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""], ["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""], ["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years","","","",""], - ["","Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","",""], + ["Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","","",""], ["4","","","",""] ] pdfname = os.path.join(testdir, "missing_values.pdf") - manager = Pdf(Stream(margins=(1.0, 0.5, 0.1)), pdfname, clean=True) + manager = Pdf(Stream(), pdfname, clean=True) tables = manager.extract() assert_equal(tables["page-1"]["table-1"]["data"], data) @@ -106,8 +106,7 @@ def test_stream_single_table_area(): ["(each day of the payroll period)",""] ] pdfname = os.path.join(testdir, "tabula_test_pdfs/us-007.pdf") - manager = Pdf(Stream(table_area=["320,500,573,335"], ytol=[10], - margins=(1.0, 0.5, 0.1)), + manager = Pdf(Stream(table_area=["320,500,573,335"]), pdfname, pagenos=[{"start": 1, "end": 1}], clean=True) tables = manager.extract() assert_equal(tables["page-1"]["table-1"]["data"], data) @@ -164,4 +163,58 @@ def test_stream_columns(): manager = Pdf(Stream(columns=["28,67,180,230,425,475,700"], ytol=[10]), pdfname, clean=True) tables = manager.extract() + assert_equal(tables["page-1"]["table-1"]["data"], data) + + +def test_stream_table_rotation(): + + data = [ + ["Table 21 Current use of contraception by background characteristics—Continued","","","","","","","","","","","","","","","",""], + ["","","","","","Modern method","","","","","","","Traditional method","","","",""], + ["","","Any","","","","","","","Other","Any","","","","Not","","Number"], + ["","Any","modern","Female","Male","","","","Condom/","modern","traditional","","With-","Folk","currently","","of"], + ["Background characteristic","method","method","sterilization","sterilization","Pill","IUD","Injectables","Nirodh","method","method","Rhythm","drawal","method","using","Total","women"], + ["Caste/tribe","","","","","","","","","","","","","","","",""], + ["Scheduled caste","74.8","55.8","42.9","0.9","9.7","0.0","0.2","2.2","0.0","19.0","11.2","7.4","0.4","25.2","100.0","1,363"], + ["Scheduled tribe","59.3","39.0","26.8","0.6","6.4","0.6","1.2","3.5","0.0","20.3","10.4","5.8","4.1","40.7","100.0","256"], + ["Other backward class","71.4","51.1","34.9","0.0","8.6","1.4","0.0","6.2","0.0","20.4","12.6","7.8","0.0","28.6","100.0","211"], + ["Other","71.1","48.8","28.2","0.8","13.3","0.9","0.3","5.2","0.1","22.3","12.9","9.1","0.3","28.9","100.0","3,319"], + ["Wealth index","","","","","","","","","","","","","","","",""], + ["Lowest","64.5","48.6","34.3","0.5","10.5","0.6","0.7","2.0","0.0","15.9","9.9","4.6","1.4","35.5","100.0","1,258"], + ["Second","68.5","50.4","36.2","1.1","11.4","0.5","0.1","1.1","0.0","18.1","11.2","6.7","0.2","31.5","100.0","1,317"], + ["Middle","75.5","52.8","33.6","0.6","14.2","0.4","0.5","3.4","0.1","22.7","13.4","8.9","0.4","24.5","100.0","1,018"], + ["Fourth","73.9","52.3","32.0","0.5","12.5","0.6","0.2","6.3","0.2","21.6","11.5","9.9","0.2","26.1","100.0","908"], + ["Highest","78.3","44.4","19.5","1.0","9.7","1.4","0.0","12.7","0.0","33.8","18.2","15.6","0.0","21.7","100.0","733"], + ["Number of living children","","","","","","","","","","","","","","","",""], + ["No children","25.1","7.6","0.3","0.5","2.0","0.0","0.0","4.8","0.0","17.5","9.0","8.5","0.0","74.9","100.0","563"], + ["1 child","66.5","32.1","3.7","0.7","20.1","0.7","0.1","6.9","0.0","34.3","18.9","15.2","0.3","33.5","100.0","1,190"], + ["1 son","66.8","33.2","4.1","0.7","21.1","0.5","0.3","6.6","0.0","33.5","21.2","12.3","0.0","33.2","100.0","672"], + ["No sons","66.1","30.7","3.1","0.6","18.8","0.8","0.0","7.3","0.0","35.4","15.8","19.0","0.6","33.9","100.0","517"], + ["2 children","81.6","60.5","41.8","0.9","11.6","0.8","0.3","4.8","0.2","21.1","12.2","8.3","0.6","18.4","100.0","1,576"], + ["1 or more sons","83.7","64.2","46.4","0.9","10.8","0.8","0.4","4.8","0.1","19.5","11.1","7.6","0.7","16.3","100.0","1,268"], + ["No sons","73.2","45.5","23.2","1.0","15.1","0.9","0.0","4.8","0.5","27.7","16.8","11.0","0.0","26.8","100.0","308"], + ["3 children","83.9","71.2","57.7","0.8","9.8","0.6","0.5","1.8","0.0","12.7","8.7","3.3","0.8","16.1","100.0","961"], + ["1 or more sons","85.0","73.2","60.3","0.9","9.4","0.5","0.5","1.6","0.0","11.8","8.1","3.0","0.7","15.0","100.0","860"], + ["No sons","74.7","53.8","35.3","0.0","13.7","1.6","0.0","3.2","0.0","20.9","13.4","6.1","1.5","25.3","100.0","101"], + ["4+ children","74.3","58.1","45.1","0.6","8.7","0.6","0.7","2.4","0.0","16.1","9.9","5.4","0.8","25.7","100.0","944"], + ["1 or more sons","73.9","58.2","46.0","0.7","8.3","0.7","0.7","1.9","0.0","15.7","9.4","5.5","0.8","26.1","100.0","901"], + ["No sons","(82.1)","(57.3)","(25.6)","(0.0)","(17.8)","(0.0)","(0.0)","(13.9)","(0.0)","(24.8)","(21.3)","(3.5)","(0.0)","(17.9)","100.0","43"], + ["Total","71.2","49.9","32.2","0.7","11.7","0.6","0.3","4.3","0.1","21.3","12.3","8.4","0.5","28.8","100.0","5,234"], + ["NFHS-2 (1998-99)","66.6","47.3","32.0","1.8","9.2","1.4","na","2.9","na","na","8.7","9.8","na","33.4","100.0","4,116"], + ["NFHS-1 (1992-93)","57.7","37.6","26.5","4.3","3.6","1.3","0.1","1.9","na","na","11.3","8.3","na","42.3","100.0","3,970"], + ["","Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are","","","","","","","","","","","","","","",""], + ["not shown separately.","","","","","","","","","","","","","","","",""], + ["na = Not available","","","","","","","","","","","","","","","",""], + ["","ns = Not shown; see table 2b, footnote 1","","","","","","","","","","","","","","",""], + ["( ) Based on 25-49 unweighted cases.","","","","","","","","","","","","","","","",""], + ["","","","","","","","54","","","","","","","","",""] + ] + pdfname = os.path.join(testdir, "left_rotated_table_2.pdf") + manager = Pdf(Stream(), pdfname, clean=True) + tables = manager.extract() + assert_equal(tables["page-1"]["table-1"]["data"], data) + + pdfname = os.path.join(testdir, "right_rotated_table_2.pdf") + manager = Pdf(Stream(), pdfname, clean=True) + tables = manager.extract() assert_equal(tables["page-1"]["table-1"]["data"], data) \ No newline at end of file diff --git a/tools/camelot b/tools/camelot index 981fd5c..408a898 100755 --- a/tools/camelot +++ b/tools/camelot @@ -34,7 +34,7 @@ options: -l, --log Log to file. -o, --output Output directory. -M, --cmargin Char margin. Chars closer than cmargin are - grouped together to form a word. [default: 2.0] + grouped together to form a word. [default: 1.0] -L, --lmargin Line margin. Lines closer than lmargin are grouped together to form a textbox. [default: 0.5] -W, --wmargin Word margin. Insert blank spaces between chars @@ -87,7 +87,7 @@ options: -y, --ytol Tolerance to account for when grouping rows together. [default: 2] -m, --mtol Tolerance to account for when merging columns - together. [default: 2] + together. [default: 0] -d, --debug Debug by visualizing textboxes. """