diff --git a/camelot/image_processing.py b/camelot/image_processing.py index d3ae8ef..5f7b247 100644 --- a/camelot/image_processing.py +++ b/camelot/image_processing.py @@ -7,8 +7,6 @@ from operator import itemgetter import cv2 import numpy as np -from .utils import merge_tuples - def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): """Thresholds an image using OpenCV's adaptiveThreshold. @@ -102,6 +100,7 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio _, contours, _ = cv2.findContours( threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) except ValueError: + # for opencv backward compatibility contours, _ = cv2.findContours( threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) @@ -141,6 +140,7 @@ def find_table_contours(vertical, horizontal): __, contours, __ = cv2.findContours( mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) except ValueError: + # for opencv backward compatibility contours, __ = cv2.findContours( mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] @@ -185,6 +185,7 @@ def find_table_joints(contours, vertical, horizontal): __, jc, __ = cv2.findContours( roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) except ValueError: + # for opencv backward compatibility jc, __ = cv2.findContours( roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) if len(jc) <= 4: # remove contours with less than 4 joints @@ -196,80 +197,4 @@ def find_table_joints(contours, vertical, horizontal): joint_coords.append((c1, c2)) tables[(x, y + h, x + w, y)] = joint_coords - return tables - - -def remove_lines(threshold, line_size_scaling=15): - """Removes lines from a thresholded image. - - Parameters - ---------- - threshold : object - numpy.ndarray representing the thresholded image. - line_size_scaling : int, optional (default: 15) - Factor by which the page dimensions will be divided to get - smallest length of lines that should be detected. - - The larger this value, smaller the detected lines. Making it - too large will lead to text being detected as lines. - - Returns - ------- - threshold : object - numpy.ndarray representing the thresholded image - with horizontal and vertical lines removed. - - """ - size = threshold.shape[0] // line_size_scaling - vertical_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) - horizontal_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1)) - dilate_el = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10)) - - vertical = cv2.erode(threshold, vertical_erode_el) - vertical = cv2.dilate(vertical, dilate_el) - - horizontal = cv2.erode(threshold, horizontal_erode_el) - horizontal = cv2.dilate(horizontal, dilate_el) - - threshold = np.bitwise_and(threshold, np.invert(vertical)) - threshold = np.bitwise_and(threshold, np.invert(horizontal)) - return threshold - - -def find_cuts(threshold, char_size_scaling=200): - """Finds cuts made by text projections on y-axis. - - Parameters - ---------- - threshold : object - numpy.ndarray representing the thresholded image. - line_size_scaling : int, optional (default: 200) - Factor by which the page dimensions will be divided to get - smallest length of lines that should be detected. - - The larger this value, smaller the detected lines. Making it - too large will lead to text being detected as lines. - - Returns - ------- - y_cuts : list - List of cuts on y-axis. - """ - size = threshold.shape[0] // char_size_scaling - char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) - - threshold = cv2.erode(threshold, char_el) - threshold = cv2.dilate(threshold, char_el) - - try: - __, contours, __ = cv2.findContours(threshold, cv2.RETR_EXTERNAL, - cv2.CHAIN_APPROX_SIMPLE) - except ValueError: - contours, __ = cv2.findContours(threshold, cv2.RETR_EXTERNAL, - cv2.CHAIN_APPROX_SIMPLE) - - contours = [cv2.boundingRect(c) for c in contours] - y_cuts = [(c[1], c[1] + c[3]) for c in contours] - y_cuts = list(merge_tuples(sorted(y_cuts))) - y_cuts = [(y_cuts[i][0] + y_cuts[i - 1][1]) // 2 for i in range(1, len(y_cuts))] - return sorted(y_cuts, reverse=True) \ No newline at end of file + return tables \ No newline at end of file diff --git a/camelot/utils.py b/camelot/utils.py index 3f30f88..b42b4f8 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -640,25 +640,4 @@ def get_text_objects(layout, ltype="char", t=None): t += get_text_objects(obj, ltype=ltype) except AttributeError: pass - return t - - -def merge_tuples(tuples): - """Merges a list of overlapping tuples. - Parameters - ---------- - tuples : list - List of tuples where a tuple is a single axis coordinate pair. - Yields - ------ - tuple - """ - merged = list(tuples[0]) - for s, e in tuples: - if s <= merged[1]: - merged[1] = max(merged[1], e) - else: - yield tuple(merged) - merged[0] = s - merged[1] = e - yield tuple(merged) \ No newline at end of file + return t \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 9a375f7..20bd683 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ click==6.7 matplotlib==2.2.3 numpy==1.15.2 opencv-python==3.4.2.17 +openpyxl==2.5.8 pandas==0.23.4 pdfminer.six==20170720 -PyPDF2==1.26.0 \ No newline at end of file +PyPDF2==1.26.0 diff --git a/tests/data.py b/tests/data.py index d9e723b..8642e8a 100755 --- a/tests/data.py +++ b/tests/data.py @@ -3,17 +3,51 @@ from __future__ import unicode_literals +data_stream = [ + ["", "Table: 5 Public Health Outlay 2012-13 (Budget Estimates) (Rs. in 000)", "", "", "", "", "", ""], + ["States-A", "Revenue", "", "Capital", "", "Total", "Others(1)", "Total"], + ["", "", "", "", "", "Revenue &", "", ""], + ["", "Medical &", "Family", "Medical &", "Family", "", "", ""], + ["", "", "", "", "", "Capital", "", ""], + ["", "Public", "Welfare", "Public", "Welfare", "", "", ""], + ["", "Health", "", "Health", "", "", "", ""], + ["Andhra Pradesh", "47,824,589", "9,967,837", "1,275,000", "15,000", "59,082,426", "14,898,243", "73,980,669"], + ["Arunachal Pradesh", "2,241,609", "107,549", "23,000", "0", "2,372,158", "86,336", "2,458,494"], + ["Assam", "14,874,821", "2,554,197", "161,600", "0", "17,590,618", "4,408,505", "21,999,123"], + ["Bihar", "21,016,708", "4,332,141", "5,329,000", "0", "30,677,849", "2,251,571", "32,929,420"], + ["Chhattisgarh", "11,427,311", "1,415,660", "2,366,592", "0", "15,209,563", "311,163", "15,520,726"], + ["Delhi", "28,084,780", "411,700", "4,550,000", "0", "33,046,480", "5,000", "33,051,480"], + ["Goa", "4,055,567", "110,000", "330,053", "0", "4,495,620", "12,560", "4,508,180"], + ["Gujarat", "26,328,400", "6,922,900", "12,664,000", "42,000", "45,957,300", "455,860", "46,413,160"], + ["Haryana", "15,156,681", "1,333,527", "40,100", "0", "16,530,308", "1,222,698", "17,753,006"], + ["Himachal Pradesh", "8,647,229", "1,331,529", "580,800", "0", "10,559,558", "725,315", "11,284,873"], + ["Jammu & Kashmir", "14,411,984", "270,840", "3,188,550", "0", "17,871,374", "166,229", "18,037,603"], + ["Jharkhand", "8,185,079", "3,008,077", "3,525,558", "0", "14,718,714", "745,139", "15,463,853"], + ["Karnataka", "34,939,843", "4,317,801", "3,669,700", "0", "42,927,344", "631,088", "43,558,432"], + ["Kerala", "27,923,965", "3,985,473", "929,503", "0", "32,838,941", "334,640", "33,173,581"], + ["Madhya Pradesh", "28,459,540", "4,072,016", "3,432,711", "0", "35,964,267", "472,139", "36,436,406"], + ["Maharashtra", "55,011,100", "6,680,721", "5,038,576", "0", "66,730,397", "313,762", "67,044,159"], + ["Manipur", "2,494,600", "187,700", "897,400", "0", "3,579,700", "0", "3,579,700"], + ["Meghalaya", "2,894,093", "342,893", "705,500", "5,000", "3,947,486", "24,128", "3,971,614"], + ["Mizoram", "1,743,501", "84,185", "10,250", "0", "1,837,936", "17,060", "1,854,996"], + ["Nagaland", "2,368,724", "204,329", "226,400", "0", "2,799,453", "783,054", "3,582,507"], + ["Odisha", "14,317,179", "2,552,292", "1,107,250", "0", "17,976,721", "451,438", "18,428,159"], + ["Puducherry", "4,191,757", "52,249", "192,400", "0", "4,436,406", "2,173", "4,438,579"], + ["Punjab", "19,775,485", "2,208,343", "2,470,882", "0", "24,454,710", "1,436,522", "25,891,232"], + ["", "Health Sector Financing by Centre and States/UTs in India [2009-10 to 2012-13](Revised) P a g e |23", "", "", "", "", "", ""] +] + data_stream_table_rotated = [ ["", "", "Table 21 Current use of contraception by background characteristics\u2014Continued", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], ["", "", "", "", "", "", "Modern method", "", "", "", "", "", "", "Traditional method", "", "", "", ""], - ["", "", "", "Any", "", "", "", "", "", "", "Other", "Any","", "", "", "Not", "", "Number"], + ["", "", "", "Any", "", "", "", "", "", "", "Other", "Any", "", "", "", "Not", "", "Number"], ["", "", "Any", "modern", "Female", "Male", "", "", "", "Condom/", "modern", "traditional", "", "With-", "Folk", "currently", "", "of"], ["", "Background characteristic", "method", "method", "sterilization", "sterilization", "Pill", "IUD", "Injectables", "Nirodh", "method", "method", "Rhythm", "drawal", "method", "using", "Total", "women"], ["", "Caste/tribe", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], ["", "Scheduled caste", "74.8", "55.8", "42.9", "0.9", "9.7", "0.0", "0.2", "2.2", "0.0", "19.0", "11.2", "7.4", "0.4", "25.2", "100.0", "1,363"], ["", "Scheduled tribe", "59.3", "39.0", "26.8", "0.6", "6.4", "0.6", "1.2", "3.5", "0.0", "20.3", "10.4", "5.8", "4.1", "40.7", "100.0", "256"], ["", "Other backward class", "71.4", "51.1", "34.9", "0.0", "8.6", "1.4", "0.0", "6.2", "0.0", "20.4", "12.6", "7.8", "0.0", "28.6", "100.0", "211"], - ["", "Other", "71.1","48.8", "28.2", "0.8", "13.3", "0.9", "0.3", "5.2", "0.1", "22.3", "12.9", "9.1", "0.3", "28.9", "100.0", "3,319"], + ["", "Other", "71.1", "48.8", "28.2", "0.8", "13.3", "0.9", "0.3", "5.2", "0.1", "22.3", "12.9", "9.1", "0.3", "28.9", "100.0", "3,319"], ["", "Wealth index", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], ["", "Lowest", "64.5", "48.6", "34.3", "0.5", "10.5", "0.6", "0.7", "2.0", "0.0", "15.9", "9.9", "4.6", "1.4", "35.5", "100.0", "1,258"], ["", "Second", "68.5", "50.4", "36.2", "1.1", "11.4", "0.5", "0.1", "1.1", "0.0", "18.1", "11.2", "6.7", "0.2", "31.5", "100.0", "1,317"], @@ -47,18 +81,18 @@ data_stream_table_rotated = [ ["", "", "", "", "", "", "", "", "54", "", "", "", "", "", "", "", "", ""] ] -data_stream_table_area_single = [ - ["","One Withholding"], - ["Payroll Period","Allowance"], - ["Weekly","$71.15"], - ["Biweekly","142.31"], - ["Semimonthly","154.17"], - ["Monthly","308.33"], - ["Quarterly","925.00"], - ["Semiannually","1,850.00"], - ["Annually","3,700.00"], - ["Daily or Miscellaneous","14.23"], - ["(each day of the payroll period)",""] +data_stream_table_area = [ + ["", "One Withholding"], + ["Payroll Period", "Allowance"], + ["Weekly", "$71.15"], + ["Biweekly", "142.31"], + ["Semimonthly", "154.17"], + ["Monthly", "308.33"], + ["Quarterly", "925.00"], + ["Semiannually", "1,850.00"], + ["Annually", "3,700.00"], + ["Daily or Miscellaneous", "14.23"], + ["(each day of the payroll period)", ""] ] data_stream_columns = [ @@ -107,82 +141,236 @@ data_stream_columns = [ ["01", "Aguascalientes", "001", "Aguascalientes", "0226", "Hacienda Nueva"] ] +data_stream_split_text = [ + ["FEB", "RUAR", "Y 2014 M27 (BUS)", "", "ALPHABETIC LISTING BY T", "YPE", "", "", "", "ABLPDM27"], + ["", "", "", "", "OF ACTIVE LICENSES", "", "", "", "", "3/19/2014"], + ["", "", "", "", "OKLAHOMA ABLE COMMIS", "SION", "", "", "", ""], + ["LICENSE", "", "", "", "PREMISE", "", "", "", "", ""], + ["NUMBER", "TYPE", "DBA NAME", "LICENSEE NAME", "ADDRESS", "CITY", "ST", "ZIP", "PHONE NUMBER", "EXPIRES"], + ["648765", "AAA", "ALLEGIANT AIR", "ALLEGIANT AIR LLC", "7100 TERMINAL DRIVE", "OKLAHOMA CITY", "OK", "73159", "-", "2014/12/03"], + ["", "", "", "", "7777 EAST APACHE", "", "", "", "", ""], + ["648766", "AAA", "ALLEGIANT AIR", "ALLEGIANT AIR LLC", "STREET", "TULSA", "OK", "74115", "-", "2014/12/16"], + ["82030", "AAA", "AMERICAN AIRLINES", "AMERICAN AIRLINES INC", "7100 TERMINAL DRIVE", "OKLAHOMA CITY", "OK", "73159", "(405) 680-3701", "2014/09/14"], + ["509462", "AAA", "AMERICAN AIRLINES", "AMERICAN AIRLINES INC", "7777 EAST APACHE DRIVE", "TULSA", "OK", "74115", "(918) 831-6302", "2014/08/19"], + ["", "", "", "AMERICAN EAGLE", "", "", "", "", "", ""], + ["509609", "AAA", "AMERICAN EAGLE", "AIRLINES INC", "7100 TERMINAL DRIVE", "OKLAHOMA CITY", "OK", "73159", "(405) 680-3701", "2014/08/19"], + ["", "", "", "AMERICAN EAGLE", "", "", "", "", "", ""], + ["402986", "AAA", "AMERICAN EAGLE", "AIRLINES INC", "7777 EAST APACHE DRIVE", "TULSA", "OK", "74115", "(859) 767-3747", "2014/10/22"], + ["", "", "", "", "WILL ROGERS AIRPORT", "", "", "", "", ""], + ["79145", "AAA", "DELTA AIR LINES", "DELTA AIR LINES INC", "BOX 59975", "OKLAHOMA CITY", "OK", "73159", "(404) 773-9745", "2014/05/11"], + ["600941", "AAA", "ENDEAVOR AIR", "ENDEAVOR AIR INC", "7100 TERMINAL DRIVE", "OKLAHOMA CITY", "OK", "73159", "(901) 348-4100", "2015/03/26"], + ["", "", "", "", "7100 TERMINAL DRIVE", "", "", "", "", ""], + ["478482", "AAA", "EXPRESSJET AIRLINES", "EXPRESSJET AIRLINES INC", "WILL ROGERS AIRPORT", "OKLAHOMA CITY", "OK", "73159", "(832) 353-1201", "2014/05/08"], + ["505981", "AAA", "SKYWEST AIRLINES", "SKYWEST INC", "7100 TERMINAL DRIVE", "OKLAHOMA CITY", "OK", "73159", "(405) 634-3000", "2014/05/28"], + ["429754", "AAA", "SOUTHWEST AIRLINES", "SOUTHWEST AIRLINES CO", "7100 TERMINAL DRIVE", "OKLAHOMA CITY", "OK", "73159", "(405) 682-4183", "2015/02/15"], + ["", "", "TULSA INTERNATIONAL", "", "", "", "", "", "", ""], + ["429755", "AAA", "AIRPORT", "SOUTHWEST AIRLINES CO", "7777 EAST APACHE DRIVE", "TULSA", "OK", "74115", "(918) 834-4495", "2015/02/16"], + ["415051", "AAA", "UNITED AIRLINES", "UNITED AIRLINES INC", "7777 EAST APACHE DRIVE", "TULSA", "OK", "74115", "(872) 825-8309", "2014/05/12"], + ["106719", "AAA", "UNITED AIRLINES", "UNITED AIRLINES INC", "WILL ROGERS AIRPORT", "OKLAHOMA CITY", "OK", "73159", "(872) 825-8309", "2014/04/11"], + ["", "", "A SENSU JAPANESE", "", "7123 SOUTH 92ND EAST", "", "", "", "", ""], + ["625422", "BAW", "RESTAURANT", "INFORMAL PARTNERSHIP", "AVENUE SUITE J", "TULSA", "OK", "74133", "(918) 252-0333", "2015/02/14"], + ["", "", "ADAMO'S ROUTE 66", "", "2132 WEST GARY", "", "", "", "", ""], + ["464828", "BAW", "ITALIAN VILLA", "TADJ INC", "BOULEVARD", "CLINTON", "OK", "73601", "(580) 323-5900", "2015/02/11"], + ["", "", "", "", "12215 NORTH", "", "", "", "", ""], + ["184066", "BAW", "AJANTA", "CABAB N' CURRY INC", "PENNSYLVANIA", "OKLAHOMA CITY", "OK", "73120", "(405) 752-5283", "2014/07/27"], + ["", "", "", "SAYRE LODGING", "", "", "", "", "", ""], + ["547693", "BAW", "AMERICINN OF SAYRE", "ENTERPRISES LLC", "2405 SOUTH EL CAMINO", "SAYRE", "OK", "73662", "(580) 928-2700", "2014/09/08"], + ["", "", "ANDOLINI'S PIZZERIA &", "", "12140 EAST 96TH STREET", "", "", "", "", ""], + ["428377", "BAW", "ITALIAN RESTAURANT", "ANDOLINI'S LLC", "NORTH #106", "OWASSO", "OK", "74055", "(918) 272-9325", "2015/02/10"], + ["", "", "ASAHI JAPANESE", "", "", "", "", "", "", ""], + ["446957", "BAW", "RESTAURANT", "JIN CORPORATION", "7831 EAST 71ST STREET", "TULSA", "OK", "74133", "(918) 307-9151", "2014/12/22"], + ["", "", "", "SMOKEHOUSE", "", "", "", "", "", ""], + ["632501", "BAW", "BACK DOOR BARBECUE", "ASSOCIATES INC", "315 NORTHWEST 23RD", "OKLAHOMA CITY", "OK", "73103", "-", "2014/08/01"], + ["598515", "BAW", "BAMBOO THAI BISTRO", "BAMBOO THAI BISTRO INC", "5079 SOUTH YALE AVENUE", "TULSA", "OK", "74135", "(918) 828-0740", "2015/03/11"], + ["", "", "BANDANA RED'S", "", "", "", "", "", "", ""], + ["618693", "BAW", "STEAKHOUSE", "BRADSHAW, STEVE_LEN", "37808 OLD HIGHWAY 270", "SHAWNEE", "OK", "74804", "-", "2014/08/20"], + ["", "", "", "", "1522 WEST LINDSEY", "", "", "", "", ""], + ["632575", "BAW", "BASHU LEGENDS", "HYH HE CHUANG LLC", "STREET", "NORMAN", "OK", "73069", "-", "2014/07/21"], + ["", "", "", "DEEP FORK HOLDINGS", "", "", "", "", "", ""], + ["543149", "BAW", "BEDLAM BAR-B-Q", "LLC", "610 NORTHEAST 50TH", "OKLAHOMA CITY", "OK", "73105", "(405) 528-7427", "2015/02/23"], + ["", "", "", "", "Page 1 of 151", "", "", "", "", ""] +] + +data_stream_flag_size = [ + ["", "TABLE 125: STATE-WISE COMPOSITION OF OUTSTANDING LIABILITIES - 1997 (Contd.)", "", "", "", "", "", "", "", "", ""], + ["", "", "", "", "(As at end-March)", "", "", "", "", "", ""], + ["", "", "", "", "", "", "", "", "", "", "(` Billion)"], + ["States", "Total", "Market", "NSSF", "WMA", "Loans", "Loans", "Loans", "Loans", "Loans", "Loans"], + ["", "Internal", "Loans", "", "from", "from", "from", "from", "from", "from SBI", "from"], + ["", "Debt", "", "", "RBI", "Banks", "LIC", "GIC", "NABARD", "& Other", "NCDC"], + ["", "", "", "", "", "& FIs", "", "", "", "Banks", ""], + ["1", "2=", "3", "4", "5", "6=", "7", "8", "9", "10", "11"], + ["", "(3 to 6)+14", "", "", "", "(7 to13)", "", "", "", "", ""], + ["Andhra Pradesh", "48.11", "40.45", "-", "3.26", "4.4", "2.62", "-", "0.91", "-", "0.25"], + ["Arunachal Pradesh", "1.23", "1.1", "-", "-", "0.13", "-", "-", "-", "-", "-"], + ["Assam", "12.69", "10.02", "-", "2.41", "0.26", "0.08", "-", "-0.06", "0.01", "0.24"], + ["Bihar", "40.75", "41.54", "-", "-", "-1.42", "0.19", "-", "-1.01", "-0.36", "0.2"], + ["Chhattisgarh", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-"], + ["Goa", "1.4", "1.02", "-", "-", "0.38", "0.31", "-", "0.07", "-", "-"], + ["Gujarat", "19.75", "17.1", "-", "-", "2.64", "1.17", "-", "1.11", "-", "0.44"], + ["Haryana", "11.53", "9.67", "-", "0.06", "1.8", "0.55", "-", "0.64", "-", "0.49"], + ["Himachal Pradesh", "8.02", "2.94", "-", "4.55", "0.53", "0.13", "-", "0.05", "-", "0.25"], + ["Jammu and Kashmir", "11.72", "4.49", "-", "-", "7.23", "0.66", "-", "0.02", "6.08", "-"], + ["Jharkhand", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-"], + ["Karnataka", "22.44", "19.59", "-", "-", "2.86", "1.22", "-", "0.89", "-", "0.69"], + ["Kerala", "29.03", "24.912", "-", "-", "4.11", "1.77", "-", "0.48", "-", "1.45"], + ["Madhya Pradesh", "27.13", "23.57", "-", "-", "3.56", "0.38", "-", "1.86", "-", "1.28"], + ["Maharashtra", "30.47", "26.07", "-", "-", "4.39", "0.21", "-", "-0.12", "0.02", "2.89"], + ["Manipur", "2.17", "1.61", "-", "0.26", "0.29", "0.08", "-", "-", "-", "0.09"], + ["Meghalaya", "1.36", "1.38", "-", "-", "-0.02", "0.04", "-", "-0.05", "-", "0.03"], + ["Mizoram", "1.17", "0.46", "-", "0.27", "0.43", "0.11", "-", "-", "-", "0.03"], + ["Nagaland", "2.99", "2.6", "-", "-", "0.39", "0.24", "-", "-", "-", "0.04"], + ["Odisha", "34.04", "27.58", "-", "4.4", "2.06", "0.56", "-", "0.66", "-", "0.2"], + ["Punjab", "19.18", "10.93", "-", "1.03", "7.23", "0.17", "-", "0.71", "5.9", "0.46"], + ["Rajasthan", "36.77", "28.63", "-", "4.99", "3.16", "0.57", "-", "1.64", "-", "0.81"], + ["Sikkim", "0.16", "-", "-", "-", "0.16", "0.03", "-", "-", "-", "0.01"], + ["Tamil Nadu", "34.11", "31.41", "-", "-", "2.7", "1.3", "-", "0.6", "-", "0.68"], + ["Tripura", "2.3", "1.89", "-", "-", "0.41", "0.41", "-", "-0.05", "-", "0.02"], + ["Uttaranchal", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-"], + ["Uttar Pradesh", "80.62", "74.89", "-", "4.34", "1.34", "0.6", "-", "-0.21", "0.18", "0.03"], + ["West Bengal", "34.23", "32.19", "-", "-", "2.04", "0.77", "-", "0.06", "-", "0.51"], + ["NCT Delhi", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-"], + ["ALL STATES", "513.38", "436.02", "-", "25.57", "51.06", "14.18", "-", "8.21", "11.83", "11.08"], + ["2 Includes `2.45 crore outstanding under “Market Loan Suspense”.", "", "", "", "", "", "", "", "", "", ""], + ["", "", "", "", "445", "", "", "", "", "", ""] +] + data_lattice = [ - ["Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""], - ["","","","Improved Speed","Decreased Accel","Eliminate Stops","Decreased Idle"], - ["2012_2","3.30","1.3","5.9%","9.5%","29.2%","17.4%"], - ["2145_1","0.68","11.2","2.4%","0.1%","9.5%","2.7%"], - ["4234_1","0.59","58.7","8.5%","1.3%","8.5%","3.3%"], - ["2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%"], - ["4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%"] + ["Cycle Name", "KI (1/km)", "Distance (mi)", "Percent Fuel Savings", "", "", ""], + ["", "", "", "Improved Speed", "Decreased Accel", "Eliminate Stops", "Decreased Idle"], + ["2012_2", "3.30", "1.3", "5.9%", "9.5%", "29.2%", "17.4%"], + ["2145_1", "0.68", "11.2", "2.4%", "0.1%", "9.5%", "2.7%"], + ["4234_1", "0.59", "58.7", "8.5%", "1.3%", "8.5%", "3.3%"], + ["2032_2", "0.17", "57.8", "21.7%", "0.3%", "2.7%", "1.2%"], + ["4171_1", "0.07", "173.9", "58.1%", "1.6%", "2.1%", "0.5%"] ] data_lattice_table_rotated = [ - ["State","Nutritional Assessment (No. of individuals)","","","","IYCF Practices (No. of mothers: 2011-12)","Blood Pressure (No. of adults: 2011-12)","","Fasting Blood Sugar (No. of adults:2011-12)",""], - ["","1975-79","1988-90","1996-97","2011-12","","Men","Women","Men","Women"], - ["Kerala","5738","6633","8864","8297","245","2161","3195","1645","2391"], - ["Tamil Nadu","7387","10217","5813","7851","413","2134","2858","1119","1739"], - ["Karnataka","6453","8138","12606","8958","428","2467","2894","1628","2028"], - ["Andhra Pradesh","5844","9920","9545","8300","557","1899","2493","1111","1529"], - ["Maharashtra","5161","7796","6883","9525","467","2368","2648","1417","1599"], - ["Gujarat","4403","5374","4866","9645","477","2687","3021","2122","2503"], - ["Madhya Pradesh","*","*","*","7942","470","1965","2150","1579","1709"], - ["Orissa","3756","5540","12024","8473","398","2040","2624","1093","1628"], - ["West Bengal","*","*","*","8047","423","2058","2743","1413","2027"], - ["Uttar Pradesh","*","*","*","9860","581","2139","2415","1185","1366"], - ["Pooled","38742","53618","60601","86898","4459","21918","27041","14312","18519"] + ["State", "Nutritional Assessment (No. of individuals)", "", "", "", "IYCF Practices (No. of mothers: 2011-12)", "Blood Pressure (No. of adults: 2011-12)", "", "Fasting Blood Sugar (No. of adults:2011-12)", ""], + ["", "1975-79", "1988-90", "1996-97", "2011-12", "", "Men", "Women", "Men", "Women"], + ["Kerala", "5738", "6633", "8864", "8297", "245", "2161", "3195", "1645", "2391"], + ["Tamil Nadu", "7387", "10217", "5813", "7851", "413", "2134", "2858", "1119", "1739"], + ["Karnataka", "6453", "8138", "12606", "8958", "428", "2467", "2894", "1628", "2028"], + ["Andhra Pradesh", "5844", "9920", "9545", "8300", "557", "1899", "2493", "1111", "1529"], + ["Maharashtra", "5161", "7796", "6883", "9525", "467", "2368", "2648", "1417", "1599"], + ["Gujarat", "4403", "5374", "4866", "9645", "477", "2687", "3021", "2122", "2503"], + ["Madhya Pradesh", "*", "*", "*", "7942", "470", "1965", "2150", "1579", "1709"], + ["Orissa", "3756", "5540", "12024", "8473", "398", "2040", "2624", "1093", "1628"], + ["West Bengal", "*", "*", "*", "8047", "423", "2058", "2743", "1413", "2027"], + ["Uttar Pradesh", "*", "*", "*", "9860", "581", "2139", "2415", "1185", "1366"], + ["Pooled", "38742", "53618", "60601", "86898", "4459", "21918", "27041", "14312", "18519"] +] + +data_lattice_table_area = [ + ["", "", "", "", "", "", "", "", ""], + ["State", "n", "Literacy Status", "", "", "", "", "", ""], + ["", "", "Illiterate", "Read & Write", "1-4 std.", "5-8 std.", "9-12 std.", "College", ""], + ["Kerala", "2400", "7.2", "0.5", "25.3", "20.1", "41.5", "5.5", ""], + ["Tamil Nadu", "2400", "21.4", "2.3", "8.8", "35.5", "25.8", "6.2", ""], + ["Karnataka", "2399", "37.4", "2.8", "12.5", "18.3", "23.1", "5.8", ""], + ["Andhra Pradesh", "2400", "54.0", "1.7", "8.4", "13.2", "18.8", "3.9", ""], + ["Maharashtra", "2400", "22.0", "0.9", "17.3", "20.3", "32.6", "7.0", ""], + ["Gujarat", "2390", "28.6", "0.1", "14.4", "23.1", "26.9", "6.8", ""], + ["Madhya Pradesh", "2402", "29.1", "3.4", "8.5", "35.1", "13.3", "10.6", ""], + ["Orissa", "2405", "33.2", "1.0", "10.4", "25.7", "21.2", "8.5", ""], + ["West Bengal", "2293", "41.7", "4.4", "13.2", "17.1", "21.2", "2.4", ""], + ["Uttar Pradesh", "2400", "35.3", "2.1", "4.5", "23.3", "27.1", "7.6", ""], + ["Pooled", "23889", "30.9", "1.9", "12.3", "23.2", "25.2", "6.4", ""], + ["", "", "", "", "", "", "", "", ""] ] data_lattice_process_background = [ - ["State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV"], - ["Delhi","1.12.2009","8","17","1.29","3,665","2,409","1,000"], - ["Rajasthan","2.12.2009 to 19.12.2009","","","","","",""], - ["Gujarat","20.12.2009 to 3.1.2010","6","13","6.03","3,810","2,317","1,453"], - ["Maharashtra","4.01.2010 to 1.2.2010","13","26","1.27","5,680","9,027","4,153"], - ["Karnataka","2.2.2010 to 22.2.2010","11","19","1.80","5,741","3,658","3,183"], - ["Kerala","23.2.2010 to 11.3.2010","9","17","1.42","3,559","2,173","855"], - ["Total","","47","92","11.81","22,455","19,584","10,644"] + ["State", "Date", "Halt stations", "Halt days", "Persons directly reached(in lakh)", "Persons trained", "Persons counseled" ,"Persons testedfor HIV"], + ["Delhi", "1.12.2009", "8", "17", "1.29", "3,665", "2,409", "1,000"], + ["Rajasthan", "2.12.2009 to 19.12.2009", "", "", "", "", "", ""], + ["Gujarat", "20.12.2009 to 3.1.2010", "6", "13", "6.03", "3,810", "2,317", "1,453"], + ["Maharashtra", "4.01.2010 to 1.2.2010", "13", "26", "1.27", "5,680", "9,027", "4,153"], + ["Karnataka", "2.2.2010 to 22.2.2010", "11", "19", "1.80", "5,741", "3,658", "3,183"], + ["Kerala", "23.2.2010 to 11.3.2010", "9", "17", "1.42", "3,559", "2,173", "855"], + ["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"] ] data_lattice_copy_text = [ - ["Plan Type","County","Plan Name","Totals"], - ["GMC","Sacramento","Anthem Blue Cross","164,380"], - ["GMC","Sacramento","Health Net","126,547"], - ["GMC","Sacramento","Kaiser Foundation","74,620"], - ["GMC","Sacramento","Molina Healthcare","59,989"], - ["GMC","San Diego","Care 1st Health Plan","71,831"], - ["GMC","San Diego","Community Health Group","264,639"], - ["GMC","San Diego","Health Net","72,404"], - ["GMC","San Diego","Kaiser","50,415"], - ["GMC","San Diego","Molina Healthcare","206,430"], - ["GMC","Total GMC Enrollment","","1,091,255"], - ["COHS","Marin","Partnership Health Plan of CA","36,006"], - ["COHS","Mendocino","Partnership Health Plan of CA","37,243"], - ["COHS","Napa","Partnership Health Plan of CA","28,398"], - ["COHS","Solano","Partnership Health Plan of CA","113,220"], - ["COHS","Sonoma","Partnership Health Plan of CA","112,271"], - ["COHS","Yolo","Partnership Health Plan of CA","52,674"], - ["COHS","Del Norte","Partnership Health Plan of CA","11,242"], - ["COHS","Humboldt","Partnership Health Plan of CA","49,911"], - ["COHS","Lake","Partnership Health Plan of CA","29,149"], - ["COHS","Lassen","Partnership Health Plan of CA","7,360"], - ["COHS","Modoc","Partnership Health Plan of CA","2,940"], - ["COHS","Shasta","Partnership Health Plan of CA","61,763"], - ["COHS","Siskiyou","Partnership Health Plan of CA","16,715"], - ["COHS","Trinity","Partnership Health Plan of CA","4,542"], - ["COHS","Merced","Central California Alliance for Health","123,907"], - ["COHS","Monterey","Central California Alliance for Health","147,397"], - ["COHS","Santa Cruz","Central California Alliance for Health","69,458"], - ["COHS","Santa Barbara","CenCal","117,609"], - ["COHS","San Luis Obispo","CenCal","55,761"], - ["COHS","Orange","CalOptima","783,079"], - ["COHS","San Mateo","Health Plan of San Mateo","113,202"], - ["COHS","Ventura","Gold Coast Health Plan","202,217"], - ["COHS","Total COHS Enrollment","","2,176,064"], - ["Subtotal for Two-Plan, Regional Model, GMC and COHS","","","10,132,022"], - ["PCCM","Los Angeles","AIDS Healthcare Foundation","828"], - ["PCCM","San Francisco","Family Mosaic","25"], - ["PCCM","Total PHP Enrollment","","853"], - ["All Models Total Enrollments","","","10,132,875"], - ["Source: Data Warehouse 12/14/15","","",""] + ["Plan Type", "County", "Plan Name", "Totals"], + ["GMC", "Sacramento", "Anthem Blue Cross", "164,380"], + ["GMC", "Sacramento", "Health Net", "126,547"], + ["GMC", "Sacramento", "Kaiser Foundation", "74,620"], + ["GMC", "Sacramento", "Molina Healthcare", "59,989"], + ["GMC", "San Diego", "Care 1st Health Plan", "71,831"], + ["GMC", "San Diego", "Community Health Group", "264,639"], + ["GMC", "San Diego", "Health Net", "72,404"], + ["GMC", "San Diego", "Kaiser", "50,415"], + ["GMC", "San Diego", "Molina Healthcare", "206,430"], + ["GMC", "Total GMC Enrollment", "", "1,091,255"], + ["COHS", "Marin", "Partnership Health Plan of CA", "36,006"], + ["COHS", "Mendocino", "Partnership Health Plan of CA", "37,243"], + ["COHS", "Napa", "Partnership Health Plan of CA", "28,398"], + ["COHS", "Solano", "Partnership Health Plan of CA", "113,220"], + ["COHS", "Sonoma", "Partnership Health Plan of CA", "112,271"], + ["COHS", "Yolo", "Partnership Health Plan of CA", "52,674"], + ["COHS", "Del Norte", "Partnership Health Plan of CA", "11,242"], + ["COHS", "Humboldt", "Partnership Health Plan of CA", "49,911"], + ["COHS", "Lake", "Partnership Health Plan of CA", "29,149"], + ["COHS", "Lassen", "Partnership Health Plan of CA", "7,360"], + ["COHS", "Modoc", "Partnership Health Plan of CA", "2,940"], + ["COHS", "Shasta", "Partnership Health Plan of CA", "61,763"], + ["COHS", "Siskiyou", "Partnership Health Plan of CA", "16,715"], + ["COHS", "Trinity", "Partnership Health Plan of CA", "4,542"], + ["COHS", "Merced", "Central California Alliance for Health", "123,907"], + ["COHS", "Monterey", "Central California Alliance for Health", "147,397"], + ["COHS", "Santa Cruz", "Central California Alliance for Health", "69,458"], + ["COHS", "Santa Barbara", "CenCal", "117,609"], + ["COHS", "San Luis Obispo", "CenCal", "55,761"], + ["COHS", "Orange", "CalOptima", "783,079"], + ["COHS", "San Mateo", "Health Plan of San Mateo", "113,202"], + ["COHS", "Ventura", "Gold Coast Health Plan", "202,217"], + ["COHS", "Total COHS Enrollment", "", "2,176,064"], + ["Subtotal for Two-Plan, Regional Model, GMC and COHS", "", "", "10,132,022"], + ["PCCM", "Los Angeles", "AIDS Healthcare Foundation", "828"], + ["PCCM", "San Francisco", "Family Mosaic", "25"], + ["PCCM", "Total PHP Enrollment", "", "853"], + ["All Models Total Enrollments", "", "", "10,132,875"], + ["Source: Data Warehouse 12/14/15", "", "", ""] +] + +data_lattice_shift_text_left_top = [ + ["Investigations", "No. ofHHs", "Age/Sex/Physiological Group", "Preva-lence", "C.I*", "RelativePrecision", "Sample sizeper State"], + ["Anthropometry", "2400", "All the available individuals", "", "", "", ""], + ["Clinical Examination", "", "", "", "", "", ""], + ["History of morbidity", "", "", "", "", "", ""], + ["Diet survey", "1200", "All the individuals partaking meals in the HH", "", "", "", ""], + ["Blood Pressure #", "2400", "Men (≥ 18yrs)", "10%", "95%", "20%", "1728"], + ["", "", "Women (≥ 18 yrs)", "", "", "", "1728"], + ["Fasting blood glucose", "2400", "Men (≥ 18 yrs)", "5%", "95%", "20%", "1825"], + ["", "", "Women (≥ 18 yrs)", "", "", "", "1825"], + ["Knowledge &Practices on HTN &DM", "2400", "Men (≥ 18 yrs)", "-", "-", "-", "1728"], + ["", "2400", "Women (≥ 18 yrs)", "-", "-", "-", "1728"] +] + +data_lattice_shift_text_disable = [ + ["Investigations", "No. ofHHs", "Age/Sex/Physiological Group", "Preva-lence", "C.I*", "RelativePrecision", "Sample sizeper State"], + ["Anthropometry", "", "", "", "", "", ""], + ["Clinical Examination", "2400", "", "All the available individuals", "", "", ""], + ["History of morbidity", "", "", "", "", "", ""], + ["Diet survey", "1200", "", "All the individuals partaking meals in the HH", "", "", ""], + ["", "", "Men (≥ 18yrs)", "", "", "", "1728"], + ["Blood Pressure #", "2400", "Women (≥ 18 yrs)", "10%", "95%", "20%", "1728"], + ["", "", "Men (≥ 18 yrs)", "", "", "", "1825"], + ["Fasting blood glucose", "2400", "Women (≥ 18 yrs)", "5%", "95%", "20%", "1825"], + ["Knowledge &Practices on HTN &", "2400", "Men (≥ 18 yrs)", "-", "-", "-", "1728"], + ["DM", "2400", "Women (≥ 18 yrs)", "-", "-", "-", "1728"] +] + +data_lattice_shift_text_right_bottom = [ + ["Investigations", "No. ofHHs", "Age/Sex/Physiological Group", "Preva-lence", "C.I*", "RelativePrecision", "Sample sizeper State"], + ["Anthropometry", "", "", "", "", "", ""], + ["Clinical Examination", "", "", "", "", "", ""], + ["History of morbidity", "2400", "", "", "", "", "All the available individuals"], + ["Diet survey", "1200", "", "", "", "", "All the individuals partaking meals in the HH"], + ["", "", "Men (≥ 18yrs)", "", "", "", "1728"], + ["Blood Pressure #", "2400", "Women (≥ 18 yrs)", "10%", "95%", "20%", "1728"], + ["", "", "Men (≥ 18 yrs)", "", "", "", "1825"], + ["Fasting blood glucose", "2400", "Women (≥ 18 yrs)", "5%", "95%", "20%", "1825"], + ["", "2400", "Men (≥ 18 yrs)", "-", "-", "-", "1728"], + ["Knowledge &Practices on HTN &DM", "2400", "Women (≥ 18 yrs)", "-", "-", "-", "1728"] ] \ No newline at end of file diff --git a/tests/files/superscript.pdf b/tests/files/superscript.pdf new file mode 100755 index 0000000..855a3bd Binary files /dev/null and b/tests/files/superscript.pdf differ diff --git a/tests/test_cli.py b/tests/test_cli.py index 4d1b7d9..da572dd 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -50,3 +50,30 @@ def test_cli_stream(): result = runner.invoke(cli, ['--output', outfile, 'stream', infile]) format_error = 'Please specify output file format using --format' assert format_error in result.output + + +def test_cli_output_format(): + with TemporaryDirectory() as tempdir: + infile = os.path.join(testdir, 'health.pdf') + outfile = os.path.join(tempdir, 'health.{}') + runner = CliRunner() + + # json + result = runner.invoke(cli, ['--format', 'json', '--output', outfile.format('json'), + 'stream', infile]) + assert result.exit_code == 0 + + # excel + result = runner.invoke(cli, ['--format', 'excel', '--output', outfile.format('xlsx'), + 'stream', infile]) + assert result.exit_code == 0 + + # html + result = runner.invoke(cli, ['--format', 'html', '--output', outfile.format('html'), + 'stream', infile]) + assert result.exit_code == 0 + + # zip + result = runner.invoke(cli, ['--zip', '--format', 'csv', '--output', outfile.format('csv'), + 'stream', infile]) + assert result.exit_code == 0 \ No newline at end of file diff --git a/tests/test_common.py b/tests/test_common.py index afcb611..d18090d 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -12,8 +12,25 @@ testdir = os.path.dirname(os.path.abspath(__file__)) testdir = os.path.join(testdir, "files") +def test_parsing_report(): + parsing_report = { + 'accuracy': 99.02, + 'whitespace': 12.24, + 'order': 1, + 'page': 1 + } + + filename = os.path.join(testdir, "foo.pdf") + tables = camelot.read_pdf(filename) + assert tables[0].parsing_report == parsing_report + + def test_stream(): - pass + df = pd.DataFrame(data_stream) + + filename = os.path.join(testdir, "health.pdf") + tables = camelot.read_pdf(filename, flavor="stream") + assert df.equals(tables[0].df) def test_stream_table_rotated(): @@ -29,7 +46,7 @@ def test_stream_table_rotated(): def test_stream_table_area(): - df = pd.DataFrame(data_stream_table_area_single) + df = pd.DataFrame(data_stream_table_area) filename = os.path.join(testdir, "tabula/us-007.pdf") tables = camelot.read_pdf(filename, flavor="stream", table_area=["320,500,573,335"]) @@ -45,6 +62,23 @@ def test_stream_columns(): assert df.equals(tables[0].df) +def test_stream_split_text(): + df = pd.DataFrame(data_stream_split_text) + + filename = os.path.join(testdir, "tabula/m27.pdf") + tables = camelot.read_pdf( + filename, flavor="stream", columns=["72,95,209,327,442,529,566,606,683"], split_text=True) + assert df.equals(tables[0].df) + + +def test_stream_flag_size(): + df = pd.DataFrame(data_stream_flag_size) + + filename = os.path.join(testdir, "superscript.pdf") + tables = camelot.read_pdf(filename, flavor="stream", flag_size=True) + assert df.equals(tables[0].df) + + def test_lattice(): df = pd.DataFrame(data_lattice) @@ -66,6 +100,14 @@ def test_lattice_table_rotated(): assert df.equals(tables[0].df) +def test_lattice_table_area(): + df = pd.DataFrame(data_lattice_table_area) + + filename = os.path.join(testdir, "twotables_2.pdf") + tables = camelot.read_pdf(filename, table_area=["80,693,535,448"]) + assert df.equals(tables[0].df) + + def test_lattice_process_background(): df = pd.DataFrame(data_lattice_process_background) @@ -79,4 +121,20 @@ def test_lattice_copy_text(): filename = os.path.join(testdir, "row_span_1.pdf") tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v") - assert df.equals(tables[0].df) \ No newline at end of file + assert df.equals(tables[0].df) + + +def test_lattice_shift_text(): + df_lt = pd.DataFrame(data_lattice_shift_text_left_top) + df_disable = pd.DataFrame(data_lattice_shift_text_disable) + df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom) + + filename = os.path.join(testdir, "column_span_2.pdf") + tables = camelot.read_pdf(filename, line_size_scaling=40) + assert df_lt.equals(tables[0].df) + + tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['']) + assert df_disable.equals(tables[0].df) + + tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['r', 'b']) + assert df_rb.equals(tables[0].df) \ No newline at end of file diff --git a/tests/test_plotting.py b/tests/test_plotting.py deleted file mode 100755 index 7c68785..0000000 --- a/tests/test_plotting.py +++ /dev/null @@ -1 +0,0 @@ -# -*- coding: utf-8 -*- \ No newline at end of file