Fix read_pdf(url) and test data
|
|
@ -29,16 +29,9 @@ from pdfminer.layout import (
|
|||
LTImage,
|
||||
)
|
||||
|
||||
|
||||
PY3 = sys.version_info[0] >= 3
|
||||
if PY3:
|
||||
from urllib.request import urlopen
|
||||
from urllib.request import Request, urlopen
|
||||
from urllib.parse import urlparse as parse_url
|
||||
from urllib.parse import uses_relative, uses_netloc, uses_params
|
||||
else:
|
||||
from urllib2 import urlopen
|
||||
from urlparse import urlparse as parse_url
|
||||
from urlparse import uses_relative, uses_netloc, uses_params
|
||||
|
||||
|
||||
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
|
||||
|
|
@ -90,11 +83,10 @@ def download_url(url):
|
|||
"""
|
||||
filename = "{}.pdf".format(random_string(6))
|
||||
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
|
||||
obj = urlopen(url)
|
||||
if PY3:
|
||||
headers = {"User-Agent": "Mozilla/5.0"}
|
||||
request = Request(url, None, headers)
|
||||
obj = urlopen(request)
|
||||
content_type = obj.info().get_content_type()
|
||||
else:
|
||||
content_type = obj.info().getheader("Content-Type")
|
||||
if content_type != "application/pdf":
|
||||
raise NotImplementedError("File format not supported")
|
||||
f.write(obj.read())
|
||||
|
|
|
|||
|
|
@ -4,16 +4,6 @@ from __future__ import unicode_literals
|
|||
|
||||
|
||||
data_stream = [
|
||||
[
|
||||
"",
|
||||
"Table: 5 Public Health Outlay 2012-13 (Budget Estimates) (Rs. in 000)",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
],
|
||||
["States-A", "Revenue", "", "Capital", "", "Total", "Others(1)", "Total"],
|
||||
["", "", "", "", "", "Revenue &", "", ""],
|
||||
["", "Medical &", "Family", "Medical &", "Family", "", "", ""],
|
||||
|
|
@ -829,18 +819,6 @@ data_stream_table_rotated = [
|
|||
]
|
||||
|
||||
data_stream_two_tables_1 = [
|
||||
[
|
||||
"[In thousands (11,062.6 represents 11,062,600) For year ending December 31. Based on Uniform Crime Reporting (UCR)",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
],
|
||||
[
|
||||
"Program. Represents arrests reported (not charged) by 12,910 agencies with a total population of 247,526,916 as estimated",
|
||||
"",
|
||||
|
|
@ -1300,29 +1278,10 @@ data_stream_two_tables_1 = [
|
|||
"",
|
||||
"",
|
||||
],
|
||||
[
|
||||
"",
|
||||
"Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
],
|
||||
]
|
||||
|
||||
|
||||
data_stream_two_tables_2 = [
|
||||
[
|
||||
"",
|
||||
"Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
],
|
||||
["Table 325. Arrests by Race: 2009", "", "", "", "", ""],
|
||||
[
|
||||
"[Based on Uniform Crime Reporting (UCR) Program. Represents arrests reported (not charged) by 12,371 agencies",
|
||||
|
|
@ -1600,16 +1559,9 @@ data_stream_two_tables_2 = [
|
|||
"3,950",
|
||||
],
|
||||
["1 Except forcible rape and prostitution.", "", "", "", "", ""],
|
||||
[
|
||||
"",
|
||||
"Source: U.S. Department of Justice, Federal Bureau of Investigation, “Crime in the United States, Arrests,” September 2010,",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
],
|
||||
]
|
||||
|
||||
|
||||
data_stream_table_areas = [
|
||||
["", "One Withholding"],
|
||||
["Payroll Period", "Allowance"],
|
||||
|
|
@ -1776,18 +1728,7 @@ data_stream_columns = [
|
|||
]
|
||||
|
||||
data_stream_split_text = [
|
||||
[
|
||||
"FEB",
|
||||
"RUAR",
|
||||
"Y 2014 M27 (BUS)",
|
||||
"",
|
||||
"ALPHABETIC LISTING BY T",
|
||||
"YPE",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"ABLPDM27",
|
||||
],
|
||||
["FEB", "RUAR", "Y 2014 M27 (BUS)", "", "", "", "", "", "", ""],
|
||||
["", "", "", "", "OF ACTIVE LICENSES", "", "", "", "", "3/19/2014"],
|
||||
["", "", "", "", "OKLAHOMA ABLE COMMIS", "SION", "", "", "", ""],
|
||||
["LICENSE", "", "", "", "PREMISE", "", "", "", "", ""],
|
||||
|
|
@ -2121,6 +2062,7 @@ data_stream_split_text = [
|
|||
],
|
||||
]
|
||||
|
||||
|
||||
data_stream_flag_size = [
|
||||
[
|
||||
"States",
|
||||
|
|
|
|||
|
Before Width: | Height: | Size: 48 KiB After Width: | Height: | Size: 33 KiB |
|
Before Width: | Height: | Size: 6.7 KiB After Width: | Height: | Size: 6.7 KiB |
|
Before Width: | Height: | Size: 13 KiB After Width: | Height: | Size: 14 KiB |
|
Before Width: | Height: | Size: 8.8 KiB After Width: | Height: | Size: 8.9 KiB |
|
Before Width: | Height: | Size: 18 KiB After Width: | Height: | Size: 19 KiB |