Fix read_pdf(url) and test data
|
|
@ -29,16 +29,9 @@ from pdfminer.layout import (
|
||||||
LTImage,
|
LTImage,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from urllib.request import Request, urlopen
|
||||||
PY3 = sys.version_info[0] >= 3
|
from urllib.parse import urlparse as parse_url
|
||||||
if PY3:
|
from urllib.parse import uses_relative, uses_netloc, uses_params
|
||||||
from urllib.request import urlopen
|
|
||||||
from urllib.parse import urlparse as parse_url
|
|
||||||
from urllib.parse import uses_relative, uses_netloc, uses_params
|
|
||||||
else:
|
|
||||||
from urllib2 import urlopen
|
|
||||||
from urlparse import urlparse as parse_url
|
|
||||||
from urlparse import uses_relative, uses_netloc, uses_params
|
|
||||||
|
|
||||||
|
|
||||||
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
|
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
|
||||||
|
|
@ -90,11 +83,10 @@ def download_url(url):
|
||||||
"""
|
"""
|
||||||
filename = "{}.pdf".format(random_string(6))
|
filename = "{}.pdf".format(random_string(6))
|
||||||
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
|
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
|
||||||
obj = urlopen(url)
|
headers = {"User-Agent": "Mozilla/5.0"}
|
||||||
if PY3:
|
request = Request(url, None, headers)
|
||||||
|
obj = urlopen(request)
|
||||||
content_type = obj.info().get_content_type()
|
content_type = obj.info().get_content_type()
|
||||||
else:
|
|
||||||
content_type = obj.info().getheader("Content-Type")
|
|
||||||
if content_type != "application/pdf":
|
if content_type != "application/pdf":
|
||||||
raise NotImplementedError("File format not supported")
|
raise NotImplementedError("File format not supported")
|
||||||
f.write(obj.read())
|
f.write(obj.read())
|
||||||
|
|
|
||||||
|
|
@ -4,16 +4,6 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
data_stream = [
|
data_stream = [
|
||||||
[
|
|
||||||
"",
|
|
||||||
"Table: 5 Public Health Outlay 2012-13 (Budget Estimates) (Rs. in 000)",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
],
|
|
||||||
["States-A", "Revenue", "", "Capital", "", "Total", "Others(1)", "Total"],
|
["States-A", "Revenue", "", "Capital", "", "Total", "Others(1)", "Total"],
|
||||||
["", "", "", "", "", "Revenue &", "", ""],
|
["", "", "", "", "", "Revenue &", "", ""],
|
||||||
["", "Medical &", "Family", "Medical &", "Family", "", "", ""],
|
["", "Medical &", "Family", "Medical &", "Family", "", "", ""],
|
||||||
|
|
@ -829,18 +819,6 @@ data_stream_table_rotated = [
|
||||||
]
|
]
|
||||||
|
|
||||||
data_stream_two_tables_1 = [
|
data_stream_two_tables_1 = [
|
||||||
[
|
|
||||||
"[In thousands (11,062.6 represents 11,062,600) For year ending December 31. Based on Uniform Crime Reporting (UCR)",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
],
|
|
||||||
[
|
[
|
||||||
"Program. Represents arrests reported (not charged) by 12,910 agencies with a total population of 247,526,916 as estimated",
|
"Program. Represents arrests reported (not charged) by 12,910 agencies with a total population of 247,526,916 as estimated",
|
||||||
"",
|
"",
|
||||||
|
|
@ -1300,29 +1278,10 @@ data_stream_two_tables_1 = [
|
||||||
"",
|
"",
|
||||||
"",
|
"",
|
||||||
],
|
],
|
||||||
[
|
|
||||||
"",
|
|
||||||
"Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
],
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
data_stream_two_tables_2 = [
|
data_stream_two_tables_2 = [
|
||||||
[
|
|
||||||
"",
|
|
||||||
"Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
],
|
|
||||||
["Table 325. Arrests by Race: 2009", "", "", "", "", ""],
|
["Table 325. Arrests by Race: 2009", "", "", "", "", ""],
|
||||||
[
|
[
|
||||||
"[Based on Uniform Crime Reporting (UCR) Program. Represents arrests reported (not charged) by 12,371 agencies",
|
"[Based on Uniform Crime Reporting (UCR) Program. Represents arrests reported (not charged) by 12,371 agencies",
|
||||||
|
|
@ -1600,16 +1559,9 @@ data_stream_two_tables_2 = [
|
||||||
"3,950",
|
"3,950",
|
||||||
],
|
],
|
||||||
["1 Except forcible rape and prostitution.", "", "", "", "", ""],
|
["1 Except forcible rape and prostitution.", "", "", "", "", ""],
|
||||||
[
|
|
||||||
"",
|
|
||||||
"Source: U.S. Department of Justice, Federal Bureau of Investigation, “Crime in the United States, Arrests,” September 2010,",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
],
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
data_stream_table_areas = [
|
data_stream_table_areas = [
|
||||||
["", "One Withholding"],
|
["", "One Withholding"],
|
||||||
["Payroll Period", "Allowance"],
|
["Payroll Period", "Allowance"],
|
||||||
|
|
@ -1776,18 +1728,7 @@ data_stream_columns = [
|
||||||
]
|
]
|
||||||
|
|
||||||
data_stream_split_text = [
|
data_stream_split_text = [
|
||||||
[
|
["FEB", "RUAR", "Y 2014 M27 (BUS)", "", "", "", "", "", "", ""],
|
||||||
"FEB",
|
|
||||||
"RUAR",
|
|
||||||
"Y 2014 M27 (BUS)",
|
|
||||||
"",
|
|
||||||
"ALPHABETIC LISTING BY T",
|
|
||||||
"YPE",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"ABLPDM27",
|
|
||||||
],
|
|
||||||
["", "", "", "", "OF ACTIVE LICENSES", "", "", "", "", "3/19/2014"],
|
["", "", "", "", "OF ACTIVE LICENSES", "", "", "", "", "3/19/2014"],
|
||||||
["", "", "", "", "OKLAHOMA ABLE COMMIS", "SION", "", "", "", ""],
|
["", "", "", "", "OKLAHOMA ABLE COMMIS", "SION", "", "", "", ""],
|
||||||
["LICENSE", "", "", "", "PREMISE", "", "", "", "", ""],
|
["LICENSE", "", "", "", "PREMISE", "", "", "", "", ""],
|
||||||
|
|
@ -2121,6 +2062,7 @@ data_stream_split_text = [
|
||||||
],
|
],
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
data_stream_flag_size = [
|
data_stream_flag_size = [
|
||||||
[
|
[
|
||||||
"States",
|
"States",
|
||||||
|
|
@ -2820,7 +2762,7 @@ data_arabic = [
|
||||||
]
|
]
|
||||||
|
|
||||||
data_stream_layout_kwargs = [
|
data_stream_layout_kwargs = [
|
||||||
["V i n s a u Ve r r e", ""],
|
["V i n s a u V e r r e", ""],
|
||||||
["Les Blancs", "12.5CL"],
|
["Les Blancs", "12.5CL"],
|
||||||
["A.O.P Côtes du Rhône", ""],
|
["A.O.P Côtes du Rhône", ""],
|
||||||
["Domaine de la Guicharde « Autour de la chapelle » 2016", "8 €"],
|
["Domaine de la Guicharde « Autour de la chapelle » 2016", "8 €"],
|
||||||
|
|
|
||||||
|
Before Width: | Height: | Size: 48 KiB After Width: | Height: | Size: 33 KiB |
|
Before Width: | Height: | Size: 6.7 KiB After Width: | Height: | Size: 6.7 KiB |
|
Before Width: | Height: | Size: 13 KiB After Width: | Height: | Size: 14 KiB |
|
Before Width: | Height: | Size: 8.8 KiB After Width: | Height: | Size: 8.9 KiB |
|
Before Width: | Height: | Size: 18 KiB After Width: | Height: | Size: 19 KiB |