[MRG] Add tests for output formats and parser kwargs (#126)

* Remove unused image processing code

* Add opencv back-compat comment

* Add tests for parser special cases

* Fix lattice table area test

* Add tests for output format

* Add openpyxl dep
pull/2/head
Vinayak Mehta 2018-10-05 16:15:30 +05:30 committed by GitHub
parent cf7823f33c
commit 6e8079df84
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 365 additions and 188 deletions

View File

@ -7,8 +7,6 @@ from operator import itemgetter
import cv2 import cv2
import numpy as np import numpy as np
from .utils import merge_tuples
def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
"""Thresholds an image using OpenCV's adaptiveThreshold. """Thresholds an image using OpenCV's adaptiveThreshold.
@ -102,6 +100,7 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
_, contours, _ = cv2.findContours( _, contours, _ = cv2.findContours(
threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
except ValueError: except ValueError:
# for opencv backward compatibility
contours, _ = cv2.findContours( contours, _ = cv2.findContours(
threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
@ -141,6 +140,7 @@ def find_table_contours(vertical, horizontal):
__, contours, __ = cv2.findContours( __, contours, __ = cv2.findContours(
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
except ValueError: except ValueError:
# for opencv backward compatibility
contours, __ = cv2.findContours( contours, __ = cv2.findContours(
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
@ -185,6 +185,7 @@ def find_table_joints(contours, vertical, horizontal):
__, jc, __ = cv2.findContours( __, jc, __ = cv2.findContours(
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
except ValueError: except ValueError:
# for opencv backward compatibility
jc, __ = cv2.findContours( jc, __ = cv2.findContours(
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
if len(jc) <= 4: # remove contours with less than 4 joints if len(jc) <= 4: # remove contours with less than 4 joints
@ -197,79 +198,3 @@ def find_table_joints(contours, vertical, horizontal):
tables[(x, y + h, x + w, y)] = joint_coords tables[(x, y + h, x + w, y)] = joint_coords
return tables return tables
def remove_lines(threshold, line_size_scaling=15):
    """Strips horizontal and vertical lines out of a thresholded image.

    Parameters
    ----------
    threshold : object
        numpy.ndarray representing the thresholded image.
    line_size_scaling : int, optional (default: 15)
        Divisor applied to ``threshold.shape[0]`` to obtain the length
        of the erosion kernels, i.e. the smallest line length that is
        detected. Larger values detect shorter lines; values that are
        too large cause text to be detected as lines.

    Returns
    -------
    threshold : object
        numpy.ndarray representing the thresholded image
        with horizontal and vertical lines removed.

    """
    min_length = threshold.shape[0] // line_size_scaling
    dilation_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10))

    cleaned = threshold
    # vertical kernel first, then horizontal; both masks are derived from the
    # untouched input image, never from the partially cleaned one
    for kernel_size in ((1, min_length), (min_length, 1)):
        erosion_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
        line_mask = cv2.dilate(cv2.erode(threshold, erosion_kernel), dilation_kernel)
        cleaned = np.bitwise_and(cleaned, np.invert(line_mask))
    return cleaned
def find_cuts(threshold, char_size_scaling=200):
    """Finds cuts made by text projections on y-axis.

    Parameters
    ----------
    threshold : object
        numpy.ndarray representing the thresholded image.
    char_size_scaling : int, optional (default: 200)
        Divisor applied to ``threshold.shape[0]`` to obtain the height
        of the structuring element used to erode and dilate the image
        before contours are extracted.

    Returns
    -------
    y_cuts : list
        List of cuts on y-axis, sorted in descending order. Each cut is
        the midpoint between the end of one merged contour band and the
        start of the next one. Empty if fewer than two bands are found.

    """
    size = threshold.shape[0] // char_size_scaling
    char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))

    threshold = cv2.erode(threshold, char_el)
    threshold = cv2.dilate(threshold, char_el)

    try:
        __, contours, __ = cv2.findContours(threshold, cv2.RETR_EXTERNAL,
                                            cv2.CHAIN_APPROX_SIMPLE)
    except ValueError:
        # for opencv backward compatibility (two return values instead of three)
        contours, __ = cv2.findContours(threshold, cv2.RETR_EXTERNAL,
                                        cv2.CHAIN_APPROX_SIMPLE)

    # Nothing detected: merge_tuples reads its first element and would
    # raise IndexError on an empty sequence, so bail out here.
    if len(contours) == 0:
        return []

    # boundingRect gives (x, y, w, h); keep only the vertical extent
    boxes = [cv2.boundingRect(c) for c in contours]
    bands = sorted((box[1], box[1] + box[3]) for box in boxes)
    bands = list(merge_tuples(bands))

    # a cut sits halfway between the end of one band and the start of the next
    y_cuts = [(lower[0] + upper[1]) // 2 for upper, lower in zip(bands, bands[1:])]
    return sorted(y_cuts, reverse=True)

View File

@ -641,24 +641,3 @@ def get_text_objects(layout, ltype="char", t=None):
except AttributeError: except AttributeError:
pass pass
return t return t
def merge_tuples(tuples):
    """Merges a list of overlapping tuples.

    The input is expected to be sorted by start coordinate; every pair
    whose start is not greater than the end of the running interval is
    folded into that interval.

    Parameters
    ----------
    tuples : list
        List (or any iterable) of tuples where a tuple is a single axis
        coordinate pair. May be empty, in which case nothing is yielded.

    Yields
    ------
    tuple
        Merged (start, end) coordinate pair.

    """
    # materialise so that generators/iterators are accepted as well
    tuples = list(tuples)
    if not tuples:
        # previously tuples[0] raised IndexError on empty input
        return
    merged = list(tuples[0])
    for s, e in tuples:
        if s <= merged[1]:
            # overlaps (or touches) the running interval: extend it
            merged[1] = max(merged[1], e)
        else:
            # gap found: emit the finished interval and start a new one
            yield tuple(merged)
            merged[0] = s
            merged[1] = e
    yield tuple(merged)

View File

@ -2,6 +2,7 @@ click==6.7
matplotlib==2.2.3 matplotlib==2.2.3
numpy==1.15.2 numpy==1.15.2
opencv-python==3.4.2.17 opencv-python==3.4.2.17
openpyxl==2.5.8
pandas==0.23.4 pandas==0.23.4
pdfminer.six==20170720 pdfminer.six==20170720
PyPDF2==1.26.0 PyPDF2==1.26.0

View File

@ -3,6 +3,40 @@
from __future__ import unicode_literals from __future__ import unicode_literals
# Expected rows (one list of cell strings per table row) that test_stream
# compares against tables[0].df for health.pdf read with flavor="stream".
data_stream = [
["", "Table: 5 Public Health Outlay 2012-13 (Budget Estimates) (Rs. in 000)", "", "", "", "", "", ""],
["States-A", "Revenue", "", "Capital", "", "Total", "Others(1)", "Total"],
["", "", "", "", "", "Revenue &", "", ""],
["", "Medical &", "Family", "Medical &", "Family", "", "", ""],
["", "", "", "", "", "Capital", "", ""],
["", "Public", "Welfare", "Public", "Welfare", "", "", ""],
["", "Health", "", "Health", "", "", "", ""],
["Andhra Pradesh", "47,824,589", "9,967,837", "1,275,000", "15,000", "59,082,426", "14,898,243", "73,980,669"],
["Arunachal Pradesh", "2,241,609", "107,549", "23,000", "0", "2,372,158", "86,336", "2,458,494"],
["Assam", "14,874,821", "2,554,197", "161,600", "0", "17,590,618", "4,408,505", "21,999,123"],
["Bihar", "21,016,708", "4,332,141", "5,329,000", "0", "30,677,849", "2,251,571", "32,929,420"],
["Chhattisgarh", "11,427,311", "1,415,660", "2,366,592", "0", "15,209,563", "311,163", "15,520,726"],
["Delhi", "28,084,780", "411,700", "4,550,000", "0", "33,046,480", "5,000", "33,051,480"],
["Goa", "4,055,567", "110,000", "330,053", "0", "4,495,620", "12,560", "4,508,180"],
["Gujarat", "26,328,400", "6,922,900", "12,664,000", "42,000", "45,957,300", "455,860", "46,413,160"],
["Haryana", "15,156,681", "1,333,527", "40,100", "0", "16,530,308", "1,222,698", "17,753,006"],
["Himachal Pradesh", "8,647,229", "1,331,529", "580,800", "0", "10,559,558", "725,315", "11,284,873"],
["Jammu & Kashmir", "14,411,984", "270,840", "3,188,550", "0", "17,871,374", "166,229", "18,037,603"],
["Jharkhand", "8,185,079", "3,008,077", "3,525,558", "0", "14,718,714", "745,139", "15,463,853"],
["Karnataka", "34,939,843", "4,317,801", "3,669,700", "0", "42,927,344", "631,088", "43,558,432"],
["Kerala", "27,923,965", "3,985,473", "929,503", "0", "32,838,941", "334,640", "33,173,581"],
["Madhya Pradesh", "28,459,540", "4,072,016", "3,432,711", "0", "35,964,267", "472,139", "36,436,406"],
["Maharashtra", "55,011,100", "6,680,721", "5,038,576", "0", "66,730,397", "313,762", "67,044,159"],
["Manipur", "2,494,600", "187,700", "897,400", "0", "3,579,700", "0", "3,579,700"],
["Meghalaya", "2,894,093", "342,893", "705,500", "5,000", "3,947,486", "24,128", "3,971,614"],
["Mizoram", "1,743,501", "84,185", "10,250", "0", "1,837,936", "17,060", "1,854,996"],
["Nagaland", "2,368,724", "204,329", "226,400", "0", "2,799,453", "783,054", "3,582,507"],
["Odisha", "14,317,179", "2,552,292", "1,107,250", "0", "17,976,721", "451,438", "18,428,159"],
["Puducherry", "4,191,757", "52,249", "192,400", "0", "4,436,406", "2,173", "4,438,579"],
["Punjab", "19,775,485", "2,208,343", "2,470,882", "0", "24,454,710", "1,436,522", "25,891,232"],
["", "Health Sector Financing by Centre and States/UTs in India [2009-10 to 2012-13](Revised) P a g e |23", "", "", "", "", "", ""]
]
data_stream_table_rotated = [ data_stream_table_rotated = [
["", "", "Table 21 Current use of contraception by background characteristics\u2014Continued", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], ["", "", "Table 21 Current use of contraception by background characteristics\u2014Continued", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
["", "", "", "", "", "", "Modern method", "", "", "", "", "", "", "Traditional method", "", "", "", ""], ["", "", "", "", "", "", "Modern method", "", "", "", "", "", "", "Traditional method", "", "", "", ""],
@ -47,7 +81,7 @@ data_stream_table_rotated = [
["", "", "", "", "", "", "", "", "54", "", "", "", "", "", "", "", "", ""] ["", "", "", "", "", "", "", "", "54", "", "", "", "", "", "", "", "", ""]
] ]
data_stream_table_area_single = [ data_stream_table_area = [
["", "One Withholding"], ["", "One Withholding"],
["Payroll Period", "Allowance"], ["Payroll Period", "Allowance"],
["Weekly", "$71.15"], ["Weekly", "$71.15"],
@ -107,6 +141,100 @@ data_stream_columns = [
["01", "Aguascalientes", "001", "Aguascalientes", "0226", "Hacienda Nueva"] ["01", "Aguascalientes", "001", "Aguascalientes", "0226", "Hacienda Nueva"]
] ]
# Expected rows that test_stream_split_text compares against tables[0].df for
# tabula/m27.pdf read with flavor="stream", explicit column separators and
# split_text=True. Header strings such as "FEB" / "RUAR" / "Y 2014 M27 (BUS)"
# are cut across neighbouring cells (presumably at the column separators).
data_stream_split_text = [
["FEB", "RUAR", "Y 2014 M27 (BUS)", "", "ALPHABETIC LISTING BY T", "YPE", "", "", "", "ABLPDM27"],
["", "", "", "", "OF ACTIVE LICENSES", "", "", "", "", "3/19/2014"],
["", "", "", "", "OKLAHOMA ABLE COMMIS", "SION", "", "", "", ""],
["LICENSE", "", "", "", "PREMISE", "", "", "", "", ""],
["NUMBER", "TYPE", "DBA NAME", "LICENSEE NAME", "ADDRESS", "CITY", "ST", "ZIP", "PHONE NUMBER", "EXPIRES"],
["648765", "AAA", "ALLEGIANT AIR", "ALLEGIANT AIR LLC", "7100 TERMINAL DRIVE", "OKLAHOMA CITY", "OK", "73159", "-", "2014/12/03"],
["", "", "", "", "7777 EAST APACHE", "", "", "", "", ""],
["648766", "AAA", "ALLEGIANT AIR", "ALLEGIANT AIR LLC", "STREET", "TULSA", "OK", "74115", "-", "2014/12/16"],
["82030", "AAA", "AMERICAN AIRLINES", "AMERICAN AIRLINES INC", "7100 TERMINAL DRIVE", "OKLAHOMA CITY", "OK", "73159", "(405) 680-3701", "2014/09/14"],
["509462", "AAA", "AMERICAN AIRLINES", "AMERICAN AIRLINES INC", "7777 EAST APACHE DRIVE", "TULSA", "OK", "74115", "(918) 831-6302", "2014/08/19"],
["", "", "", "AMERICAN EAGLE", "", "", "", "", "", ""],
["509609", "AAA", "AMERICAN EAGLE", "AIRLINES INC", "7100 TERMINAL DRIVE", "OKLAHOMA CITY", "OK", "73159", "(405) 680-3701", "2014/08/19"],
["", "", "", "AMERICAN EAGLE", "", "", "", "", "", ""],
["402986", "AAA", "AMERICAN EAGLE", "AIRLINES INC", "7777 EAST APACHE DRIVE", "TULSA", "OK", "74115", "(859) 767-3747", "2014/10/22"],
["", "", "", "", "WILL ROGERS AIRPORT", "", "", "", "", ""],
["79145", "AAA", "DELTA AIR LINES", "DELTA AIR LINES INC", "BOX 59975", "OKLAHOMA CITY", "OK", "73159", "(404) 773-9745", "2014/05/11"],
["600941", "AAA", "ENDEAVOR AIR", "ENDEAVOR AIR INC", "7100 TERMINAL DRIVE", "OKLAHOMA CITY", "OK", "73159", "(901) 348-4100", "2015/03/26"],
["", "", "", "", "7100 TERMINAL DRIVE", "", "", "", "", ""],
["478482", "AAA", "EXPRESSJET AIRLINES", "EXPRESSJET AIRLINES INC", "WILL ROGERS AIRPORT", "OKLAHOMA CITY", "OK", "73159", "(832) 353-1201", "2014/05/08"],
["505981", "AAA", "SKYWEST AIRLINES", "SKYWEST INC", "7100 TERMINAL DRIVE", "OKLAHOMA CITY", "OK", "73159", "(405) 634-3000", "2014/05/28"],
["429754", "AAA", "SOUTHWEST AIRLINES", "SOUTHWEST AIRLINES CO", "7100 TERMINAL DRIVE", "OKLAHOMA CITY", "OK", "73159", "(405) 682-4183", "2015/02/15"],
["", "", "TULSA INTERNATIONAL", "", "", "", "", "", "", ""],
["429755", "AAA", "AIRPORT", "SOUTHWEST AIRLINES CO", "7777 EAST APACHE DRIVE", "TULSA", "OK", "74115", "(918) 834-4495", "2015/02/16"],
["415051", "AAA", "UNITED AIRLINES", "UNITED AIRLINES INC", "7777 EAST APACHE DRIVE", "TULSA", "OK", "74115", "(872) 825-8309", "2014/05/12"],
["106719", "AAA", "UNITED AIRLINES", "UNITED AIRLINES INC", "WILL ROGERS AIRPORT", "OKLAHOMA CITY", "OK", "73159", "(872) 825-8309", "2014/04/11"],
["", "", "A SENSU JAPANESE", "", "7123 SOUTH 92ND EAST", "", "", "", "", ""],
["625422", "BAW", "RESTAURANT", "INFORMAL PARTNERSHIP", "AVENUE SUITE J", "TULSA", "OK", "74133", "(918) 252-0333", "2015/02/14"],
["", "", "ADAMO'S ROUTE 66", "", "2132 WEST GARY", "", "", "", "", ""],
["464828", "BAW", "ITALIAN VILLA", "TADJ INC", "BOULEVARD", "CLINTON", "OK", "73601", "(580) 323-5900", "2015/02/11"],
["", "", "", "", "12215 NORTH", "", "", "", "", ""],
["184066", "BAW", "AJANTA", "CABAB N' CURRY INC", "PENNSYLVANIA", "OKLAHOMA CITY", "OK", "73120", "(405) 752-5283", "2014/07/27"],
["", "", "", "SAYRE LODGING", "", "", "", "", "", ""],
["547693", "BAW", "AMERICINN OF SAYRE", "ENTERPRISES LLC", "2405 SOUTH EL CAMINO", "SAYRE", "OK", "73662", "(580) 928-2700", "2014/09/08"],
["", "", "ANDOLINI'S PIZZERIA &", "", "12140 EAST 96TH STREET", "", "", "", "", ""],
["428377", "BAW", "ITALIAN RESTAURANT", "ANDOLINI'S LLC", "NORTH #106", "OWASSO", "OK", "74055", "(918) 272-9325", "2015/02/10"],
["", "", "ASAHI JAPANESE", "", "", "", "", "", "", ""],
["446957", "BAW", "RESTAURANT", "JIN CORPORATION", "7831 EAST 71ST STREET", "TULSA", "OK", "74133", "(918) 307-9151", "2014/12/22"],
["", "", "", "SMOKEHOUSE", "", "", "", "", "", ""],
["632501", "BAW", "BACK DOOR BARBECUE", "ASSOCIATES INC", "315 NORTHWEST 23RD", "OKLAHOMA CITY", "OK", "73103", "-", "2014/08/01"],
["598515", "BAW", "BAMBOO THAI BISTRO", "BAMBOO THAI BISTRO INC", "5079 SOUTH YALE AVENUE", "TULSA", "OK", "74135", "(918) 828-0740", "2015/03/11"],
["", "", "BANDANA RED'S", "", "", "", "", "", "", ""],
["618693", "BAW", "STEAKHOUSE", "BRADSHAW, STEVE_LEN", "37808 OLD HIGHWAY 270", "SHAWNEE", "OK", "74804", "-", "2014/08/20"],
["", "", "", "", "1522 WEST LINDSEY", "", "", "", "", ""],
["632575", "BAW", "BASHU LEGENDS", "HYH HE CHUANG LLC", "STREET", "NORMAN", "OK", "73069", "-", "2014/07/21"],
["", "", "", "DEEP FORK HOLDINGS", "", "", "", "", "", ""],
["543149", "BAW", "BEDLAM BAR-B-Q", "LLC", "610 NORTHEAST 50TH", "OKLAHOMA CITY", "OK", "73105", "(405) 528-7427", "2015/02/23"],
["", "", "", "", "Page 1 of 151", "", "", "", "", ""]
]
# Expected rows that test_stream_flag_size compares against tables[0].df for
# superscript.pdf read with flavor="stream" and flag_size=True.
# NOTE(review): the <s>...</s> markers presumably wrap text that flag_size
# flagged as differently sized (super/subscripts) -- confirm against the parser.
data_stream_flag_size = [
["", "TABLE 125: STATE-WISE COMPOSITION OF OUTSTANDING LIABILITIES - 1997 <s>(Contd.)</s>", "", "", "", "", "", "", "", "", ""],
["", "", "", "", "(As at end-March)", "", "", "", "", "", ""],
["", "", "", "", "", "", "", "", "", "", "(<s>`</s> Billion)"],
["States", "Total", "Market", "NSSF", "WMA", "Loans", "Loans", "Loans", "Loans", "Loans", "Loans"],
["", "Internal", "Loans", "", "from", "from", "from", "from", "from", "from SBI", "from"],
["", "Debt", "", "", "RBI", "Banks", "LIC", "GIC", "NABARD", "& Other", "NCDC"],
["", "", "", "", "", "& FIs", "", "", "", "Banks", ""],
["1", "2=", "3", "4", "5", "6=", "7", "8", "9", "10", "11"],
["", "(3 to 6)+14", "", "", "", "(7 to13)", "", "", "", "", ""],
["Andhra Pradesh", "48.11", "40.45", "-", "3.26", "4.4", "2.62", "-", "0.91", "-", "0.25"],
["Arunachal Pradesh", "1.23", "1.1", "-", "-", "0.13", "-", "-", "-", "-", "-"],
["Assam", "12.69", "10.02", "-", "2.41", "0.26", "0.08", "-", "-0.06", "0.01", "0.24"],
["Bihar", "40.75", "41.54", "-", "-", "-1.42", "0.19", "-", "-1.01", "-0.36", "0.2"],
["Chhattisgarh", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-"],
["Goa", "1.4", "1.02", "-", "-", "0.38", "0.31", "-", "0.07", "-", "-"],
["Gujarat", "19.75", "17.1", "-", "-", "2.64", "1.17", "-", "1.11", "-", "0.44"],
["Haryana", "11.53", "9.67", "-", "0.06", "1.8", "0.55", "-", "0.64", "-", "0.49"],
["Himachal Pradesh", "8.02", "2.94", "-", "4.55", "0.53", "0.13", "-", "0.05", "-", "0.25"],
["Jammu and Kashmir", "11.72", "4.49", "-", "-", "7.23", "0.66", "-", "0.02", "6.08", "-"],
["Jharkhand", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-"],
["Karnataka", "22.44", "19.59", "-", "-", "2.86", "1.22", "-", "0.89", "-", "0.69"],
["Kerala", "29.03", "24.91<s>2</s>", "-", "-", "4.11", "1.77", "-", "0.48", "-", "1.45"],
["Madhya Pradesh", "27.13", "23.57", "-", "-", "3.56", "0.38", "-", "1.86", "-", "1.28"],
["Maharashtra", "30.47", "26.07", "-", "-", "4.39", "0.21", "-", "-0.12", "0.02", "2.89"],
["Manipur", "2.17", "1.61", "-", "0.26", "0.29", "0.08", "-", "-", "-", "0.09"],
["Meghalaya", "1.36", "1.38", "-", "-", "-0.02", "0.04", "-", "-0.05", "-", "0.03"],
["Mizoram", "1.17", "0.46", "-", "0.27", "0.43", "0.11", "-", "-", "-", "0.03"],
["Nagaland", "2.99", "2.6", "-", "-", "0.39", "0.24", "-", "-", "-", "0.04"],
["Odisha", "34.04", "27.58", "-", "4.4", "2.06", "0.56", "-", "0.66", "-", "0.2"],
["Punjab", "19.18", "10.93", "-", "1.03", "7.23", "0.17", "-", "0.71", "5.9", "0.46"],
["Rajasthan", "36.77", "28.63", "-", "4.99", "3.16", "0.57", "-", "1.64", "-", "0.81"],
["Sikkim", "0.16", "-", "-", "-", "0.16", "0.03", "-", "-", "-", "0.01"],
["Tamil Nadu", "34.11", "31.41", "-", "-", "2.7", "1.3", "-", "0.6", "-", "0.68"],
["Tripura", "2.3", "1.89", "-", "-", "0.41", "0.41", "-", "-0.05", "-", "0.02"],
["Uttaranchal", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-"],
["Uttar Pradesh", "80.62", "74.89", "-", "4.34", "1.34", "0.6", "-", "-0.21", "0.18", "0.03"],
["West Bengal", "34.23", "32.19", "-", "-", "2.04", "0.77", "-", "0.06", "-", "0.51"],
["NCT Delhi", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-"],
["ALL STATES", "513.38", "436.02", "-", "25.57", "51.06", "14.18", "-", "8.21", "11.83", "11.08"],
["<s>2</s> Includes `2.45 crore outstanding under “Market Loan Suspense”.", "", "", "", "", "", "", "", "", "", ""],
["", "", "", "", "445", "", "", "", "", "", ""]
]
data_lattice = [ data_lattice = [
["Cycle Name", "KI (1/km)", "Distance (mi)", "Percent Fuel Savings", "", "", ""], ["Cycle Name", "KI (1/km)", "Distance (mi)", "Percent Fuel Savings", "", "", ""],
["", "", "", "Improved Speed", "Decreased Accel", "Eliminate Stops", "Decreased Idle"], ["", "", "", "Improved Speed", "Decreased Accel", "Eliminate Stops", "Decreased Idle"],
@ -133,6 +261,24 @@ data_lattice_table_rotated = [
["Pooled", "38742", "53618", "60601", "86898", "4459", "21918", "27041", "14312", "18519"] ["Pooled", "38742", "53618", "60601", "86898", "4459", "21918", "27041", "14312", "18519"]
] ]
# Expected rows that test_lattice_table_area compares against tables[0].df for
# twotables_2.pdf read with table_area=["80,693,535,448"]. The first and last
# rows and the last column are entirely empty strings.
data_lattice_table_area = [
["", "", "", "", "", "", "", "", ""],
["State", "n", "Literacy Status", "", "", "", "", "", ""],
["", "", "Illiterate", "Read & Write", "1-4 std.", "5-8 std.", "9-12 std.", "College", ""],
["Kerala", "2400", "7.2", "0.5", "25.3", "20.1", "41.5", "5.5", ""],
["Tamil Nadu", "2400", "21.4", "2.3", "8.8", "35.5", "25.8", "6.2", ""],
["Karnataka", "2399", "37.4", "2.8", "12.5", "18.3", "23.1", "5.8", ""],
["Andhra Pradesh", "2400", "54.0", "1.7", "8.4", "13.2", "18.8", "3.9", ""],
["Maharashtra", "2400", "22.0", "0.9", "17.3", "20.3", "32.6", "7.0", ""],
["Gujarat", "2390", "28.6", "0.1", "14.4", "23.1", "26.9", "6.8", ""],
["Madhya Pradesh", "2402", "29.1", "3.4", "8.5", "35.1", "13.3", "10.6", ""],
["Orissa", "2405", "33.2", "1.0", "10.4", "25.7", "21.2", "8.5", ""],
["West Bengal", "2293", "41.7", "4.4", "13.2", "17.1", "21.2", "2.4", ""],
["Uttar Pradesh", "2400", "35.3", "2.1", "4.5", "23.3", "27.1", "7.6", ""],
["Pooled", "23889", "30.9", "1.9", "12.3", "23.2", "25.2", "6.4", ""],
["", "", "", "", "", "", "", "", ""]
]
data_lattice_process_background = [ data_lattice_process_background = [
["State", "Date", "Halt stations", "Halt days", "Persons directly reached(in lakh)", "Persons trained", "Persons counseled" ,"Persons testedfor HIV"], ["State", "Date", "Halt stations", "Halt days", "Persons directly reached(in lakh)", "Persons trained", "Persons counseled" ,"Persons testedfor HIV"],
["Delhi", "1.12.2009", "8", "17", "1.29", "3,665", "2,409", "1,000"], ["Delhi", "1.12.2009", "8", "17", "1.29", "3,665", "2,409", "1,000"],
@ -186,3 +332,45 @@ data_lattice_copy_text = [
["All Models Total Enrollments", "", "", "10,132,875"], ["All Models Total Enrollments", "", "", "10,132,875"],
["Source: Data Warehouse 12/14/15", "", "", ""] ["Source: Data Warehouse 12/14/15", "", "", ""]
] ]
# Expected rows that test_lattice_shift_text compares against tables[0].df for
# column_span_2.pdf read with line_size_scaling=40 and no shift_text argument
# (i.e. whatever the parser's default shift_text is). Text of spanning cells
# ends up in the first (left/top-most) cell of the span.
data_lattice_shift_text_left_top = [
["Investigations", "No. ofHHs", "Age/Sex/Physiological Group", "Preva-lence", "C.I*", "RelativePrecision", "Sample sizeper State"],
["Anthropometry", "2400", "All the available individuals", "", "", "", ""],
["Clinical Examination", "", "", "", "", "", ""],
["History of morbidity", "", "", "", "", "", ""],
["Diet survey", "1200", "All the individuals partaking meals in the HH", "", "", "", ""],
["Blood Pressure #", "2400", "Men (≥ 18yrs)", "10%", "95%", "20%", "1728"],
["", "", "Women (≥ 18 yrs)", "", "", "", "1728"],
["Fasting blood glucose", "2400", "Men (≥ 18 yrs)", "5%", "95%", "20%", "1825"],
["", "", "Women (≥ 18 yrs)", "", "", "", "1825"],
["Knowledge &Practices on HTN &DM", "2400", "Men (≥ 18 yrs)", "-", "-", "-", "1728"],
["", "2400", "Women (≥ 18 yrs)", "-", "-", "-", "1728"]
]
# Expected rows that test_lattice_shift_text compares against tables[0].df for
# column_span_2.pdf read with line_size_scaling=40 and shift_text=[''].
# Text of spanning cells stays in an interior cell of the span instead of
# being moved to one of its edges.
data_lattice_shift_text_disable = [
["Investigations", "No. ofHHs", "Age/Sex/Physiological Group", "Preva-lence", "C.I*", "RelativePrecision", "Sample sizeper State"],
["Anthropometry", "", "", "", "", "", ""],
["Clinical Examination", "2400", "", "All the available individuals", "", "", ""],
["History of morbidity", "", "", "", "", "", ""],
["Diet survey", "1200", "", "All the individuals partaking meals in the HH", "", "", ""],
["", "", "Men (≥ 18yrs)", "", "", "", "1728"],
["Blood Pressure #", "2400", "Women (≥ 18 yrs)", "10%", "95%", "20%", "1728"],
["", "", "Men (≥ 18 yrs)", "", "", "", "1825"],
["Fasting blood glucose", "2400", "Women (≥ 18 yrs)", "5%", "95%", "20%", "1825"],
["Knowledge &Practices on HTN &", "2400", "Men (≥ 18 yrs)", "-", "-", "-", "1728"],
["DM", "2400", "Women (≥ 18 yrs)", "-", "-", "-", "1728"]
]
# Expected rows that test_lattice_shift_text compares against tables[0].df for
# column_span_2.pdf read with line_size_scaling=40 and shift_text=['r', 'b'].
# Text of spanning cells ends up in the last (right/bottom-most) cell of the
# span.
data_lattice_shift_text_right_bottom = [
["Investigations", "No. ofHHs", "Age/Sex/Physiological Group", "Preva-lence", "C.I*", "RelativePrecision", "Sample sizeper State"],
["Anthropometry", "", "", "", "", "", ""],
["Clinical Examination", "", "", "", "", "", ""],
["History of morbidity", "2400", "", "", "", "", "All the available individuals"],
["Diet survey", "1200", "", "", "", "", "All the individuals partaking meals in the HH"],
["", "", "Men (≥ 18yrs)", "", "", "", "1728"],
["Blood Pressure #", "2400", "Women (≥ 18 yrs)", "10%", "95%", "20%", "1728"],
["", "", "Men (≥ 18 yrs)", "", "", "", "1825"],
["Fasting blood glucose", "2400", "Women (≥ 18 yrs)", "5%", "95%", "20%", "1825"],
["", "2400", "Men (≥ 18 yrs)", "-", "-", "-", "1728"],
["Knowledge &Practices on HTN &DM", "2400", "Women (≥ 18 yrs)", "-", "-", "-", "1728"]
]

Binary file not shown.

View File

@ -50,3 +50,30 @@ def test_cli_stream():
result = runner.invoke(cli, ['--output', outfile, 'stream', infile]) result = runner.invoke(cli, ['--output', outfile, 'stream', infile])
format_error = 'Please specify output file format using --format' format_error = 'Please specify output file format using --format'
assert format_error in result.output assert format_error in result.output
def test_cli_output_format():
    """Run the ``stream`` CLI command once per output option set.

    Each invocation writes into a temporary directory and must exit
    with status code 0.
    """
    with TemporaryDirectory() as tempdir:
        infile = os.path.join(testdir, 'health.pdf')
        outfile = os.path.join(tempdir, 'health.{}')
        runner = CliRunner()

        # (leading CLI options, file extension used for --output)
        scenarios = (
            (['--format', 'json'], 'json'),    # json
            (['--format', 'excel'], 'xlsx'),   # excel
            (['--format', 'html'], 'html'),    # html
            (['--zip', '--format', 'csv'], 'csv'),  # zip
        )
        for leading_options, extension in scenarios:
            arguments = leading_options + [
                '--output', outfile.format(extension), 'stream', infile]
            result = runner.invoke(cli, arguments)
            assert result.exit_code == 0

View File

@ -12,8 +12,25 @@ testdir = os.path.dirname(os.path.abspath(__file__))
testdir = os.path.join(testdir, "files") testdir = os.path.join(testdir, "files")
def test_parsing_report():
    """The first table of foo.pdf must carry the expected parsing report."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    expected_report = dict(accuracy=99.02, whitespace=12.24, order=1, page=1)

    first_table = camelot.read_pdf(pdf_path)[0]
    assert first_table.parsing_report == expected_report
def test_stream(): def test_stream():
pass df = pd.DataFrame(data_stream)
filename = os.path.join(testdir, "health.pdf")
tables = camelot.read_pdf(filename, flavor="stream")
assert df.equals(tables[0].df)
def test_stream_table_rotated(): def test_stream_table_rotated():
@ -29,7 +46,7 @@ def test_stream_table_rotated():
def test_stream_table_area(): def test_stream_table_area():
df = pd.DataFrame(data_stream_table_area_single) df = pd.DataFrame(data_stream_table_area)
filename = os.path.join(testdir, "tabula/us-007.pdf") filename = os.path.join(testdir, "tabula/us-007.pdf")
tables = camelot.read_pdf(filename, flavor="stream", table_area=["320,500,573,335"]) tables = camelot.read_pdf(filename, flavor="stream", table_area=["320,500,573,335"])
@ -45,6 +62,23 @@ def test_stream_columns():
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
def test_stream_split_text():
    """Stream flavor with explicit column separators and ``split_text=True``."""
    pdf_path = os.path.join(testdir, "tabula/m27.pdf")
    column_separators = ["72,95,209,327,442,529,566,606,683"]
    expected = pd.DataFrame(data_stream_split_text)

    parsed = camelot.read_pdf(
        pdf_path, flavor="stream", columns=column_separators, split_text=True)
    assert expected.equals(parsed[0].df)
def test_stream_flag_size():
    """Stream flavor with ``flag_size=True`` on superscript.pdf."""
    pdf_path = os.path.join(testdir, "superscript.pdf")
    expected = pd.DataFrame(data_stream_flag_size)

    parsed = camelot.read_pdf(pdf_path, flavor="stream", flag_size=True)
    assert expected.equals(parsed[0].df)
def test_lattice(): def test_lattice():
df = pd.DataFrame(data_lattice) df = pd.DataFrame(data_lattice)
@ -66,6 +100,14 @@ def test_lattice_table_rotated():
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
def test_lattice_table_area():
    """Default flavor restricted to a single ``table_area`` on twotables_2.pdf."""
    pdf_path = os.path.join(testdir, "twotables_2.pdf")
    expected = pd.DataFrame(data_lattice_table_area)

    parsed = camelot.read_pdf(pdf_path, table_area=["80,693,535,448"])
    assert expected.equals(parsed[0].df)
def test_lattice_process_background(): def test_lattice_process_background():
df = pd.DataFrame(data_lattice_process_background) df = pd.DataFrame(data_lattice_process_background)
@ -80,3 +122,19 @@ def test_lattice_copy_text():
filename = os.path.join(testdir, "row_span_1.pdf") filename = os.path.join(testdir, "row_span_1.pdf")
tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v") tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v")
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
def test_lattice_shift_text():
    """Check ``shift_text`` behaviour on column_span_2.pdf.

    Three reads are compared against their fixtures: no ``shift_text``
    argument, ``shift_text=['']`` and ``shift_text=['r', 'b']``.
    """
    df_lt = pd.DataFrame(data_lattice_shift_text_left_top)
    df_disable = pd.DataFrame(data_lattice_shift_text_disable)
    df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom)

    filename = os.path.join(testdir, "column_span_2.pdf")

    # (expected frame, extra keyword arguments for read_pdf)
    scenarios = (
        (df_lt, {}),
        (df_disable, {'shift_text': ['']}),
        (df_rb, {'shift_text': ['r', 'b']}),
    )
    for expected, extra_kwargs in scenarios:
        tables = camelot.read_pdf(filename, line_size_scaling=40, **extra_kwargs)
        assert expected.equals(tables[0].df)

View File

@ -1 +0,0 @@
# -*- coding: utf-8 -*-