Fix read_pdf(url) and test data

pull/146/head
Vinayak Mehta 2020-05-24 17:26:52 +05:30
parent fc1b6f6227
commit 3afb72b872
No known key found for this signature in database
GPG Key ID: 2170CDB940114C1D
7 changed files with 14 additions and 80 deletions

View File

@ -29,16 +29,9 @@ from pdfminer.layout import (
LTImage, LTImage,
) )
from urllib.request import Request, urlopen
PY3 = sys.version_info[0] >= 3 from urllib.parse import urlparse as parse_url
if PY3: from urllib.parse import uses_relative, uses_netloc, uses_params
from urllib.request import urlopen
from urllib.parse import urlparse as parse_url
from urllib.parse import uses_relative, uses_netloc, uses_params
else:
from urllib2 import urlopen
from urlparse import urlparse as parse_url
from urlparse import uses_relative, uses_netloc, uses_params
_VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
@ -90,11 +83,10 @@ def download_url(url):
""" """
filename = "{}.pdf".format(random_string(6)) filename = "{}.pdf".format(random_string(6))
with tempfile.NamedTemporaryFile("wb", delete=False) as f: with tempfile.NamedTemporaryFile("wb", delete=False) as f:
obj = urlopen(url) headers = {"User-Agent": "Mozilla/5.0"}
if PY3: request = Request(url, None, headers)
obj = urlopen(request)
content_type = obj.info().get_content_type() content_type = obj.info().get_content_type()
else:
content_type = obj.info().getheader("Content-Type")
if content_type != "application/pdf": if content_type != "application/pdf":
raise NotImplementedError("File format not supported") raise NotImplementedError("File format not supported")
f.write(obj.read()) f.write(obj.read())

View File

@ -4,16 +4,6 @@ from __future__ import unicode_literals
data_stream = [ data_stream = [
[
"",
"Table: 5 Public Health Outlay 2012-13 (Budget Estimates) (Rs. in 000)",
"",
"",
"",
"",
"",
"",
],
["States-A", "Revenue", "", "Capital", "", "Total", "Others(1)", "Total"], ["States-A", "Revenue", "", "Capital", "", "Total", "Others(1)", "Total"],
["", "", "", "", "", "Revenue &", "", ""], ["", "", "", "", "", "Revenue &", "", ""],
["", "Medical &", "Family", "Medical &", "Family", "", "", ""], ["", "Medical &", "Family", "Medical &", "Family", "", "", ""],
@ -829,18 +819,6 @@ data_stream_table_rotated = [
] ]
data_stream_two_tables_1 = [ data_stream_two_tables_1 = [
[
"[In thousands (11,062.6 represents 11,062,600) For year ending December 31. Based on Uniform Crime Reporting (UCR)",
"",
"",
"",
"",
"",
"",
"",
"",
"",
],
[ [
"Program. Represents arrests reported (not charged) by 12,910 agencies with a total population of 247,526,916 as estimated", "Program. Represents arrests reported (not charged) by 12,910 agencies with a total population of 247,526,916 as estimated",
"", "",
@ -1300,29 +1278,10 @@ data_stream_two_tables_1 = [
"", "",
"", "",
], ],
[
"",
"Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.",
"",
"",
"",
"",
"",
"",
"",
"",
],
] ]
data_stream_two_tables_2 = [ data_stream_two_tables_2 = [
[
"",
"Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.",
"",
"",
"",
"",
],
["Table 325. Arrests by Race: 2009", "", "", "", "", ""], ["Table 325. Arrests by Race: 2009", "", "", "", "", ""],
[ [
"[Based on Uniform Crime Reporting (UCR) Program. Represents arrests reported (not charged) by 12,371 agencies", "[Based on Uniform Crime Reporting (UCR) Program. Represents arrests reported (not charged) by 12,371 agencies",
@ -1600,16 +1559,9 @@ data_stream_two_tables_2 = [
"3,950", "3,950",
], ],
["1 Except forcible rape and prostitution.", "", "", "", "", ""], ["1 Except forcible rape and prostitution.", "", "", "", "", ""],
[
"",
"Source: U.S. Department of Justice, Federal Bureau of Investigation, “Crime in the United States, Arrests,” September 2010,",
"",
"",
"",
"",
],
] ]
data_stream_table_areas = [ data_stream_table_areas = [
["", "One Withholding"], ["", "One Withholding"],
["Payroll Period", "Allowance"], ["Payroll Period", "Allowance"],
@ -1776,18 +1728,7 @@ data_stream_columns = [
] ]
data_stream_split_text = [ data_stream_split_text = [
[ ["FEB", "RUAR", "Y 2014 M27 (BUS)", "", "", "", "", "", "", ""],
"FEB",
"RUAR",
"Y 2014 M27 (BUS)",
"",
"ALPHABETIC LISTING BY T",
"YPE",
"",
"",
"",
"ABLPDM27",
],
["", "", "", "", "OF ACTIVE LICENSES", "", "", "", "", "3/19/2014"], ["", "", "", "", "OF ACTIVE LICENSES", "", "", "", "", "3/19/2014"],
["", "", "", "", "OKLAHOMA ABLE COMMIS", "SION", "", "", "", ""], ["", "", "", "", "OKLAHOMA ABLE COMMIS", "SION", "", "", "", ""],
["LICENSE", "", "", "", "PREMISE", "", "", "", "", ""], ["LICENSE", "", "", "", "PREMISE", "", "", "", "", ""],
@ -2121,6 +2062,7 @@ data_stream_split_text = [
], ],
] ]
data_stream_flag_size = [ data_stream_flag_size = [
[ [
"States", "States",
@ -2820,7 +2762,7 @@ data_arabic = [
] ]
data_stream_layout_kwargs = [ data_stream_layout_kwargs = [
["V i n s a u Ve r r e", ""], ["V i n s a u V e r r e", ""],
["Les Blancs", "12.5CL"], ["Les Blancs", "12.5CL"],
["A.O.P Côtes du Rhône", ""], ["A.O.P Côtes du Rhône", ""],
["Domaine de la Guicharde « Autour de la chapelle » 2016", "8 €"], ["Domaine de la Guicharde « Autour de la chapelle » 2016", "8 €"],

Binary file not shown.

Before

Width:  |  Height:  |  Size: 48 KiB

After

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.7 KiB

After

Width:  |  Height:  |  Size: 6.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 13 KiB

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.8 KiB

After

Width:  |  Height:  |  Size: 8.9 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 18 KiB

After

Width:  |  Height:  |  Size: 19 KiB