blob: ebc41184f83d58316e752f3dd71e1a484d97fcbb [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""An extensible library for opening URLs using a variety of protocols
2
The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
urlopen(url, data=None) -- Basic usage is the same as the original
urllib.  Pass the URL and, optionally, data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of a URL.  Raises a URLError (subclass of
OSError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.
23
build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of a default
handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
Senthil Kumaran1107c5d2009-11-15 06:20:55 +000033
Senthil Kumaran47fff872009-12-20 07:10:31 +000034OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000036
37Request -- An object that encapsulates the state of a request. The
38state can be as simple as the URL. It can also include extra HTTP
39headers, e.g. a User-Agent.
40
41BaseHandler --
42
43internals:
44BaseHandler and parent
45_call_chain conventions
46
47Example usage:
48
Georg Brandl029986a2008-06-23 11:44:14 +000049import urllib.request
Jeremy Hylton1afc1692008-06-18 20:49:58 +000050
51# set up authentication info
Georg Brandl029986a2008-06-23 11:44:14 +000052authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000053authinfo.add_password(realm='PDQ Application',
54 uri='https://mahler:8092/site-updates.py',
55 user='klem',
56 passwd='geheim$parole')
57
Georg Brandl029986a2008-06-23 11:44:14 +000058proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton1afc1692008-06-18 20:49:58 +000059
60# build a new opener that adds authentication and caching FTP handlers
Georg Brandl029986a2008-06-23 11:44:14 +000061opener = urllib.request.build_opener(proxy_support, authinfo,
62 urllib.request.CacheFTPHandler)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000063
64# install it
Georg Brandl029986a2008-06-23 11:44:14 +000065urllib.request.install_opener(opener)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000066
Georg Brandl029986a2008-06-23 11:44:14 +000067f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton1afc1692008-06-18 20:49:58 +000068"""
69
70# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails for some reason, how should the error be signalled?  The
# client needs to know the HTTP error code.  But if the handler knows
# what the problem was, e.g., that it didn't recognise the hash
# algorithm requested in the challenge, it would be good to pass that
# information along to the client, too.
77# ftp errors aren't handled cleanly
78# check digest against correct (i.e. non-apache) implementation
79
80# Possible extensions:
81# complex proxies XXX not sure what exactly was meant by this
82# abstract factory for opener
83
84import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000085import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000086import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
Jeremy Hylton1afc1692008-06-18 20:49:58 +000092import re
93import socket
Martin Pantere6f06092016-05-16 01:14:20 +000094import string
Jeremy Hylton1afc1692008-06-18 20:49:58 +000095import sys
96import time
Senthil Kumarane24f96a2012-03-13 19:29:33 -070097import tempfile
98import contextlib
Senthil Kumaran38b968b92012-03-14 13:43:53 -070099import warnings
Senthil Kumarane24f96a2012-03-13 19:29:33 -0700100
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000101
Georg Brandl13e89462008-07-01 19:56:00 +0000102from urllib.error import URLError, HTTPError, ContentTooShortError
103from urllib.parse import (
Rémi Lapeyre674ee122019-05-27 15:43:45 +0200104 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
Cheryl Sabella0250de42018-04-25 16:51:54 -0700105 _splittype, _splithost, _splitport, _splituser, _splitpasswd,
106 _splitattr, _splitquery, _splitvalue, _splittag, _to_bytes,
Antoine Pitroudf204be2012-11-24 17:59:08 +0100107 unquote_to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000108from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000109
# Probe for the optional ssl module.  _have_ssl records whether the
# import succeeded so HTTPS-specific code paths can be guarded.
_have_ssl = True
try:
    import ssl
except ImportError:
    _have_ssl = False
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000117
# Public API of this module: the names exported by a star-import.
__all__ = [
    # Classes
    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
    'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler',
    'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler',
    'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler',
    'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler',
    'UnknownHandler', 'HTTPErrorProcessor',
    # Functions
    'urlopen', 'install_opener', 'build_opener',
    'pathname2url', 'url2pathname', 'getproxies',
    # Legacy interface
    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
]

# used in User-Agent header sent
# ("<major>.<minor>" of the running interpreter)
__version__ = '%d.%d' % sys.version_info[:2]
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000137
# Module-wide default opener; created lazily by urlopen() or set
# explicitly through install_opener().
_opener = None


def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None, cadefault=False, context=None):
    """Open *url* (a string or a Request object) and return the response.

    *data* is the additional data to send to the server, or None when
    there is none.  See Request for details.

    *timeout* is handed to the opener as the timeout, in seconds, for
    blocking operations such as the connection attempt.  When omitted,
    the global default timeout setting is used.

    *context*, if given, must be an ssl.SSLContext describing the SSL
    options; a dedicated opener with an HTTPSHandler built from it is
    used for this call only.

    *cafile* and *capath* are deprecated.  They name a CA bundle file
    and a directory of hashed certificate files respectively, and are
    passed to ssl.create_default_context().  *cadefault* is deprecated
    too: it does not influence which certificates are loaded, but a true
    value still selects the deprecated code path.  Using any of the
    three emits a DeprecationWarning; combining them with *context*
    raises ValueError, as does using them when the ssl module is not
    available.

    When none of the SSL arguments is used, the module-wide opener is
    used; it is created with build_opener() on first need, unless one
    was installed with install_opener().

    The return value is whatever the opener's open() method returns.
    """
    global _opener

    if cafile or capath or cadefault:
        # Deprecated path: build a one-off SSL context from the CA options.
        import warnings
        warnings.warn("cafile, capath and cadefault are deprecated, use a "
                      "custom context instead.", DeprecationWarning, 2)
        if context is not None:
            raise ValueError(
                "You can't pass both context and any of cafile, capath, and "
                "cadefault"
            )
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
                                             cafile=cafile,
                                             capath=capath)
        one_off = build_opener(HTTPSHandler(context=context))
        return one_off.open(url, data, timeout)

    if context:
        # Caller-supplied context: again a one-off opener.
        one_off = build_opener(HTTPSHandler(context=context))
        return one_off.open(url, data, timeout)

    if _opener is None:
        shared = _opener = build_opener()
    else:
        shared = _opener
    return shared.open(url, data, timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000215
def install_opener(opener):
    """Make *opener* the module-wide default opener.

    The object is stored in the module global ``_opener`` without any
    validation.
    """
    global _opener
    _opener = opener
219
# Paths of the temporary files created by urlretrieve(); they are removed
# again by urlcleanup().
_url_tempfiles = []


def urlretrieve(url, filename=None, reporthook=None, data=None):
    """
    Retrieve a URL into a location on disk.

    url        -- the URL to fetch.
    filename   -- destination path; when omitted, a named temporary
                  file is created (and remembered for urlcleanup()).
    reporthook -- optional callable invoked as
                  reporthook(block_number, block_size, total_size) once
                  before reading and once after every block written;
                  total_size is -1 when there is no Content-Length
                  header.
    data       -- passed through to urlopen().

    For a file: URL with no filename, nothing is copied: the normalized
    local path is returned together with the headers.

    Returns a (path, headers) tuple.  Raises ContentTooShortError when
    fewer bytes were read than the Content-Length header announced.
    """
    url_type, path = _splittype(url)

    with contextlib.closing(urlopen(url, data)) as fp:
        headers = fp.info()

        # Just hand back the local path for file:// URLs.  No sense in
        # performing a copy unless a destination was requested.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        # Pick the destination file object.
        if not filename:
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = tfp.name
            _url_tempfiles.append(filename)
        else:
            tfp = open(filename, 'wb')

        with tfp:
            result = filename, headers
            bs = 1024*8
            read = 0
            blocknum = 0
            size = (int(headers["Content-Length"])
                    if "content-length" in headers else -1)

            if reporthook:
                reporthook(blocknum, bs, size)

            while block := fp.read(bs):
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)

    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000283
def urlcleanup():
    """Clean up temporary files from urlretrieve calls.

    Every path recorded in ``_url_tempfiles`` is unlinked (OSError is
    ignored), the list is emptied, and a truthy module-wide opener is
    reset to None.
    """
    global _opener

    for temp_file in _url_tempfiles:
        # Best effort: the file may already be gone.
        with contextlib.suppress(OSError):
            os.unlink(temp_file)
    _url_tempfiles.clear()

    if _opener:
        _opener = None
296
# copied from cookielib.py
# Matches a trailing ":<digits>" port suffix.
_cut_port_re = re.compile(r":\d+$", re.ASCII)


def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is the network-location part of request.full_url; when
    that is empty, the request's "Host" header is used instead (empty
    string if absent).  A trailing port is stripped.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urlparse(request.full_url).netloc
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present
    without_port = _cut_port_re.sub("", netloc, count=1)
    return without_port.lower()
314
class Request:
    """The state of a single URL request.

    Holds the URL (split into type, host, selector and fragment), the
    optional body data, two header dictionaries (regular headers and
    "unredirected" ones that are not copied to a redirected request) and
    an optional explicit HTTP method.
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):
        # NOTE: the shared default dict for *headers* is only read here,
        # never mutated.
        self.full_url = url          # property: also parses the URL
        self.headers = {}
        self.unredirected_hdrs = {}
        self._data = None
        self.data = data             # property: needs the header dicts
        self._tunnel_host = None
        for name, value in headers.items():
            self.add_header(name, value)
        self.origin_req_host = (request_host(self)
                                if origin_req_host is None
                                else origin_req_host)
        self.unverifiable = unverifiable
        # Only set the attribute for a truthy method; get_method() relies
        # on its absence to fall back to GET/POST.
        if method:
            self.method = method

    @property
    def full_url(self):
        """The URL, with the fragment re-attached when there is one."""
        if not self.fragment:
            return self._full_url
        return f'{self._full_url}#{self.fragment}'

    @full_url.setter
    def full_url(self, url):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self._full_url, self.fragment = _splittag(unwrap(url))
        self._parse()

    @full_url.deleter
    def full_url(self):
        self._full_url = None
        self.fragment = None
        self.selector = ''

    @property
    def data(self):
        """The request body, or None."""
        return self._data

    @data.setter
    def data(self, data):
        if data != self._data:
            self._data = data
            # issue 16464
            # A Content-length header was most probably computed for the
            # previous body, so drop it when the body changes.
            if self.has_header("Content-length"):
                self.remove_header("Content-length")

    @data.deleter
    def data(self):
        self.data = None

    def _parse(self):
        """Split the stored URL into type, host and selector."""
        scheme, remainder = _splittype(self._full_url)
        self.type = scheme
        if scheme is None:
            raise ValueError("unknown url type: %r" % self.full_url)
        self.host, self.selector = _splithost(remainder)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        fallback = "GET" if self.data is None else "POST"
        return getattr(self, 'method', fallback)

    def get_full_url(self):
        """Return the full_url property."""
        return self.full_url

    def set_proxy(self, host, type):
        """Route the request through the proxy *host*.

        An https request without a tunnel host yet records its original
        host as the tunnel host; any other request switches to *type*
        and uses the full URL as its selector.
        """
        needs_tunnel = self.type == 'https' and not self._tunnel_host
        if needs_tunnel:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        """Return True when the selector equals the full URL."""
        return self.selector == self.full_url

    def add_header(self, key, val):
        """Store a regular header under the capitalized key."""
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        """Store a header that will not be added to a redirected request."""
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """Return True if *header_name* is a key in either header dict."""
        tables = (self.headers, self.unredirected_hdrs)
        return any(header_name in table for table in tables)

    def get_header(self, header_name, default=None):
        """Look *header_name* up in the regular headers, then in the
        unredirected ones, returning *default* when it is in neither."""
        if header_name in self.headers:
            return self.headers[header_name]
        return self.unredirected_hdrs.get(header_name, default)

    def remove_header(self, header_name):
        """Remove *header_name* from both header dicts, if present."""
        for table in (self.headers, self.unredirected_hdrs):
            table.pop(header_name, None)

    def header_items(self):
        """Return a list of (name, value) pairs of all headers; a regular
        header overrides an unredirected one of the same name."""
        merged = dict(self.unredirected_hdrs)
        merged.update(self.headers)
        return list(merged.items())
423
class OpenerDirector:
    """Dispatch requests to registered handler objects.

    Handlers are indexed by the names of their methods:
    ``<protocol>_open``, ``<protocol>_request``, ``<protocol>_response``
    and ``<protocol>_error_<kind>``.
    """

    def __init__(self):
        self.addheaders = [('User-agent', "Python-urllib/%s" % __version__)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler* under every method name that matches one of
        the dispatch patterns.

        Raises TypeError when *handler* has no ``add_parent`` attribute.
        The handler is only added to self.handlers (and told about its
        parent) if at least one of its methods was registered.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        # condition suffix -> table keyed directly by protocol
        by_condition = {
            "open": self.handle_open,
            "response": self.process_response,
            "request": self.process_request,
        }

        registered = False
        for name in dir(handler):
            if name in ("redirect_request", "do_open", "proxy_open"):
                # oops, coincidental match
                continue

            sep = name.find("_")
            protocol, condition = name[:sep], name[sep + 1:]

            if condition.startswith("error"):
                # The kind is whatever follows the second underscore;
                # numeric kinds (HTTP status codes) become ints.
                second = condition.find("_") + sep + 1
                kind = name[second + 1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                table = self.handle_error.setdefault(protocol, {})
            elif condition in by_condition:
                kind = protocol
                table = by_condition[condition]
            else:
                continue

            chain = table.setdefault(kind, [])
            if chain:
                # keep the chain sorted by the handlers' ordering
                bisect.insort(chain, handler)
            else:
                chain.append(handler)
            registered = True

        if registered:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        """Call *meth_name* on each handler registered in chain[kind]
        and return the first result that is not None."""
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        for handler in chain.get(kind, ()):
            outcome = getattr(handler, meth_name)(*args)
            if outcome is not None:
                return outcome

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a request object).

        The request is passed through the ``<protocol>_request``
        processors, opened with _open(), and the response is passed
        through the ``<protocol>_response`` processors.
        """
        # accept a URL or a Request object
        if not isinstance(fullurl, str):
            req = fullurl
            if data is not None:
                req.data = data
        else:
            req = Request(fullurl, data)

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        request_hook = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            req = getattr(processor, request_hook)(req)

        sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
        response = self._open(req, data)

        # post-process response
        response_hook = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            response = getattr(processor, response_hook)(req, response)

        return response

    def _open(self, req, data=None):
        """Try the 'default' openers, then the protocol-specific ones,
        then the 'unknown' ones; return the first truthy result (or the
        result of the 'unknown' chain)."""
        protocol = req.type
        attempts = (
            ('default', 'default_open'),
            (protocol, protocol + '_open'),
        )
        for kind, meth_name in attempts:
            result = self._call_chain(self.handle_open, kind, meth_name, req)
            if result:
                return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered error handlers.

        For 'http' and 'https', args[2] is used as the error kind and
        ``http_error_<kind>`` handlers are tried first, then the
        ``http_error_default`` ones.
        """
        is_http = proto in ('http', 'https')
        if is_http:
            # XXX http[s] protocols are special-cased
            table = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
        else:
            table = self.handle_error
            meth_name = proto + '_error'

        result = self._call_chain(table, proto, meth_name, *args)
        if result:
            return result

        if is_http:
            return self._call_chain(table, 'default', 'http_error_default',
                                    *args)
562
# XXX probably also want an abstract factory that knows when it makes
# sense to skip a superclass in favor of a subclass and when it might
# make sense to include both

def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    Each argument may be a handler instance or a handler class (which
    is instantiated without arguments).

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    A default handler class is left out when any argument is a subclass,
    or an instance of a subclass, of it.  Default handlers are added to
    the opener first, then the arguments in the order given.
    """
    def _overrides(candidate, default):
        # True when *candidate* (class or instance) stands in for *default*.
        if isinstance(candidate, type):
            return issubclass(candidate, default)
        return isinstance(candidate, default)

    defaults = [ProxyHandler, UnknownHandler, HTTPHandler,
                HTTPDefaultErrorHandler, HTTPRedirectHandler,
                FTPHandler, FileHandler, HTTPErrorProcessor,
                DataHandler]
    if hasattr(http.client, "HTTPSConnection"):
        defaults.append(HTTPSHandler)

    defaults = [default for default in defaults
                if not any(_overrides(given, default) for given in handlers)]

    opener = OpenerDirector()
    for default in defaults:
        opener.add_handler(default())

    for given in handlers:
        instance = given() if isinstance(given, type) else given
        opener.add_handler(instance)
    return opener
602
class BaseHandler:
    """Base class for handlers; provides ordering and parent tracking."""

    # Sort key used by __lt__; lower values sort first.
    handler_order = 500

    def add_parent(self, parent):
        """Remember *parent* as ``self.parent``."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        try:
            other_order = other.handler_order
        except AttributeError:
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other_order
620
621
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return *response* unchanged for a 2xx status; otherwise return
        whatever the parent's error() dispatch produces."""
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= code < 300:
            return response

        return self.parent.error('http', request, response, code, msg, hdrs)

    https_response = http_response
638
class HTTPDefaultErrorHandler(BaseHandler):
    """Handler whose default action for an HTTP error is to raise
    HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        # HTTPError takes (url, code, msg, hdrs, fp): note fp goes last.
        error = HTTPError(req.full_url, code, msg, hdrs, fp)
        raise error
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000642
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303, 307 and 308 redirects.

    http_error_302() (aliased for the other status codes) extracts the
    target from the response headers, validates and normalizes it, asks
    redirect_request() for the follow-up Request and opens it through
    the parent opener.
    """
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.

        GET and HEAD requests are redirected for 301, 302, 303, 307 and
        308; POST requests only for 301, 302 and 303.  The new request
        carries the original headers minus Content-Length and
        Content-Type, and is marked unverifiable.
        """
        m = req.get_method()
        # 307 and 308 require the method and body to be preserved, so
        # they are only followed automatically for GET/HEAD.
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items()
                      if k.lower() not in CONTENT_HEADERS}
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Handle a redirect response.

        Returns the response of the redirected request, or None when the
        response names no target or redirect_request() declines.  Raises
        HTTPError for a disallowed target scheme or a detected redirect
        loop.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        # A target with a host but no path gets the root path.
        if not urlparts.path and urlparts.netloc:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        # http.client.parse_headers() decodes as ISO-8859-1.  Recover the
        # original bytes and percent-encode non-ASCII bytes, and any special
        # characters such as the space.
        newurl = quote(
            newurl, encoding="iso-8859-1", safe=string.punctuation)
        # Resolve a relative target against the URL that was requested.
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        # The same dict object is shared along the whole redirect chain.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    # Every supported redirect status shares one implementation; 308
    # (Permanent Redirect, RFC 7538) is handled like the others.
    http_error_301 = http_error_303 = http_error_307 = http_error_302
    http_error_308 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
754
755
756def _parse_proxy(proxy):
757 """Return (scheme, user, password, host/port) given a URL or an authority.
758
759 If a URL is supplied, it must have an authority (host:port) component.
760 According to RFC 3986, having an authority component means the URL must
Senthil Kumarand8e24f12014-04-14 16:32:20 -0400761 have two slashes after the scheme.
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000762 """
Cheryl Sabella0250de42018-04-25 16:51:54 -0700763 scheme, r_scheme = _splittype(proxy)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000764 if not r_scheme.startswith("/"):
765 # authority
766 scheme = None
767 authority = proxy
768 else:
769 # URL
770 if not r_scheme.startswith("//"):
771 raise ValueError("proxy URL with no authority: %r" % proxy)
772 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
773 # and 3.3.), path is empty or starts with '/'
774 end = r_scheme.find("/", 2)
775 if end == -1:
776 end = None
777 authority = r_scheme[2:end]
Cheryl Sabella0250de42018-04-25 16:51:54 -0700778 userinfo, hostport = _splituser(authority)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000779 if userinfo is not None:
Cheryl Sabella0250de42018-04-25 16:51:54 -0700780 user, password = _splitpasswd(userinfo)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000781 else:
782 user = password = None
783 return scheme, user, password, hostport
784
class ProxyHandler(BaseHandler):
    """Route requests through the proxies configured per URL scheme.

    For every scheme in the proxies mapping an instance attribute
    "<scheme>_open" is created that forwards to proxy_open().
    """

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, proxy_url in proxies.items():
            scheme = scheme.lower()
            # Default arguments bind the per-scheme values at definition time.
            opener = (lambda r, proxy=proxy_url, type=scheme,
                      meth=self.proxy_open: meth(r, proxy, type))
            setattr(self, scheme + '_open', opener)

    def proxy_open(self, req, proxy, type):
        """Point req at the given proxy.

        Returns None when the host bypasses the proxy or when other
        handlers can carry on with the modified request; otherwise
        re-dispatches the request through the parent opener.
        """
        request_scheme = req.type
        proxy_scheme, user, password, hostport = _parse_proxy(proxy)
        if proxy_scheme is None:
            proxy_scheme = request_scheme

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            raw = '%s:%s' % (unquote(user), unquote(password))
            token = base64.b64encode(raw.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + token)
        req.set_proxy(unquote(hostport), proxy_scheme)

        if request_scheme != proxy_scheme and request_scheme != 'https':
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)
        # let other handlers take care of it
        return None
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000827
class HTTPPasswordMgr:
    """Store (user, password) pairs keyed by realm and URI prefix.

    self.passwd maps realm -> {tuple of reduced URIs: (user, passwd)},
    where a reduced URI is the (authority, path) pair produced by
    reduce_uri().
    """

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for realm at uri (one URI or a sequence)."""
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        # Store both forms (with and without the default port filled in)
        # so that lookups succeed either way.
        for default_port in True, False:
            reduced_uri = tuple(
                self.reduce_uri(u, default_port) for u in uri)
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) for authuri in realm, or (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = _splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare on path-segment boundaries: a plain character-wise
        # common prefix would treat "/foobar" as being below "/foo" and
        # hand out credentials for an unrelated resource.
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
890
891
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the None realm."""

    def find_user_password(self, realm, authuri):
        """Look up authuri in realm first, then in the default (None) realm."""
        user, password = HTTPPasswordMgr.find_user_password(
            self, realm, authuri)
        if user is None:
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
900
901
class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm):
    """Password manager that also tracks an "authenticated" flag per URI.

    self.authenticated maps reduced URIs (see reduce_uri) to the flag
    most recently recorded for them.
    """

    def __init__(self, *args, **kwargs):
        self.authenticated = {}
        super().__init__(*args, **kwargs)

    def add_password(self, realm, uri, user, passwd, is_authenticated=False):
        """Store credentials and record the authenticated flag for uri."""
        self.update_authenticated(uri, is_authenticated)
        # Add a default for prior auth requests
        if realm is None:
            target_realms = (None,)
        else:
            target_realms = (None, realm)
        for target in target_realms:
            super().add_password(target, uri, user, passwd)

    def update_authenticated(self, uri, is_authenticated=False):
        """Record is_authenticated for uri (one URI or a sequence)."""
        # uri could be a single URI or a sequence
        uris = [uri] if isinstance(uri, str) else uri

        for default_port in (True, False):
            for item in uris:
                key = self.reduce_uri(item, default_port)
                self.authenticated[key] = is_authenticated

    def is_authenticated(self, authuri):
        """Return the flag of the first stored URI that authuri falls under.

        Returns None when no stored URI matches.
        """
        for default_port in (True, False):
            candidate = self.reduce_uri(authuri, default_port)
            for known, flag in self.authenticated.items():
                if self.is_suburi(known, candidate):
                    return flag
        return None
931
932
class AbstractBasicAuthHandler:
    """Shared logic for HTTP Basic authentication handlers.

    Subclasses provide the auth_header attribute (the request header
    that carries the credentials) and, through the handler machinery,
    the parent opener used to retry the request.
    """

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    #
    # The pattern is anchored on the start of the header or on a ','
    # separator.  An unanchored "(?:.*,)*" prefix backtracks
    # catastrophically on headers containing many commas.
    rx = re.compile('(?:^|,)'        # start of the string or ','
                    '[ \t]*'         # optional whitespace
                    '([^ \t,]+)'     # scheme like "Basic"
                    '[ \t]+'         # mandatory whitespace
                    'realm=(["\']?)([^"\']*)\\2',
                    re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry req with Basic credentials for the challenged realm.

        authreq names the challenge header to read from headers.
        Returns the retried response, or None when there is no usable
        challenge or no stored password.  Raises ValueError when the
        header starts with a scheme other than Basic.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        challenge = headers.get(authreq, None)
        if not challenge:
            return None
        tokens = challenge.split()
        if not tokens:
            # Header made only of whitespace: nothing to act on.
            return None
        scheme = tokens[0]
        if scheme.lower() != 'basic':
            raise ValueError("AbstractBasicAuthHandler does not"
                             " support the following scheme: '%s'" %
                             scheme)

        # Use the first Basic challenge that specifies a realm.
        for mo in AbstractBasicAuthHandler.rx.finditer(challenge):
            scheme, quote_char, realm = mo.groups()
            if scheme.lower() != 'basic':
                continue
            if quote_char not in ['"', "'"]:
                warnings.warn("Basic Auth Realm was unquoted",
                              UserWarning, 2)
            return self.retry_http_basic_auth(host, req, realm)
        return None

    def retry_http_basic_auth(self, host, req, realm):
        """Re-issue req with stored credentials for realm.

        Returns None when no password is stored or when the request
        already carried exactly these credentials (so retrying would
        not help).
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.get_header(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None

    def http_request(self, req):
        """Pre-emptively add credentials when the password manager
        reports the request URL as already authenticated."""
        if (not hasattr(self.passwd, 'is_authenticated') or
                not self.passwd.is_authenticated(req.full_url)):
            return req

        if not req.has_header('Authorization'):
            user, passwd = self.passwd.find_user_password(None, req.full_url)
            credentials = '{0}:{1}'.format(user, passwd).encode()
            auth_str = base64.standard_b64encode(credentials).decode()
            req.add_unredirected_header('Authorization',
                                        'Basic {}'.format(auth_str.strip()))
        return req

    def http_response(self, req, response):
        """Record in the password manager whether the request succeeded."""
        if hasattr(self.passwd, 'is_authenticated'):
            if 200 <= response.code < 300:
                self.passwd.update_authenticated(req.full_url, True)
            else:
                self.passwd.update_authenticated(req.full_url, False)
        return response

    https_request = http_request
    https_response = http_response
1010
1011
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001012
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses using Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Delegate to http_error_auth_reqed using the request's full URL."""
        return self.http_error_auth_reqed(
            'www-authenticate', req.full_url, req, headers)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001022
1023
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses using Basic credentials for the proxy."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Delegate to http_error_auth_reqed using the request's host."""
        # http_error_auth_reqed requires that there is no userinfo component
        # in authority.  Assume there isn't one, since urllib.request does
        # not (and should not, RFC 3986 s. 3.2.1) support requests for URLs
        # containing userinfo.
        return self.http_error_auth_reqed(
            'proxy-authenticate', req.host, req, headers)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001037
1038
Senthil Kumaran6c5bd402011-11-01 23:20:31 +08001039# Return n random bytes.
1040_randombytes = os.urandom
1041
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001042
class AbstractDigestAuthHandler:
    """Shared logic for HTTP Digest authentication handlers.

    Subclasses provide the auth_header attribute (the request header
    that carries the credentials) and, through the handler machinery,
    the parent opener used to retry the request.
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry req in response to a Digest challenge.

        Returns the retried response, or None when the challenge header
        is missing or uses the Basic scheme.  Raises HTTPError after
        more than five retries and ValueError for any scheme other than
        Digest or Basic.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        """Parse the challenge in auth and re-issue req with credentials.

        Returns None when no credentials could be computed or when the
        request already carried exactly this authorization value.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest credentials string for req from challenge chal.

        chal is the mapping of challenge parameters.  Returns None when
        "realm" or "nonce" is missing or no user is known for the realm.
        Raises URLError when the server's qop options do not include
        "auth", and ValueError (from get_algorithm_impls) for an
        unsupported algorithm.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in [option.strip() for option in qop.split(',')]:
            # The server may offer several options (e.g. "auth,auth-int");
            # the client picks one of them, and "auth" is the one
            # implemented here.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce,
                                           'auth', H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) function pair for the named algorithm.

        Raises ValueError for anything other than 'MD5' or 'SHA'.
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        else:
            raise ValueError("Unsupported digest authentication "
                             "algorithm %r" % algorithm)
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
1185
1186
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        """Answer a 401 response, then reset the retry counter."""
        authority = urlparse(req.full_url)[1]
        outcome = self.http_error_auth_reqed(
            'www-authenticate', authority, req, headers)
        # Only reached when the call above did not raise.
        self.reset_retry_count()
        return outcome
1203
1204
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 responses using Digest credentials for the proxy."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        """Answer a 407 response, then reset the retry counter."""
        outcome = self.http_error_auth_reqed(
            'proxy-authenticate', req.host, req, headers)
        # Only reached when the call above did not raise.
        self.reset_retry_count()
        return outcome
1216
class AbstractHTTPHandler(BaseHandler):
    """Common request preparation and connection handling for HTTP(S)."""

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def _get_content_length(self, request):
        method = request.get_method()
        return http.client.HTTPConnection._get_content_length(
            request.data, method)

    def do_request_(self, request):
        """Fill in the headers a request needs before it is sent.

        Adds Content-type and Content-length / Transfer-encoding for
        requests with a body, a Host header, and the parent opener's
        default headers, each only when not already present.  Raises
        URLError when the request has no host and TypeError when the
        body is a str.  Returns the same request object.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        body = request.data
        if body is not None:  # POST
            if isinstance(body, str):
                raise TypeError(
                    "POST data should be bytes, an iterable of bytes, "
                    "or a file object. It cannot be of type str.")
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            framing_known = (request.has_header('Content-length')
                             or request.has_header('Transfer-encoding'))
            if not framing_known:
                length = self._get_content_length(request)
                if length is None:
                    request.add_unredirected_header(
                        'Transfer-encoding', 'chunked')
                else:
                    request.add_unredirected_header(
                        'Content-length', str(length))

        host_header = host
        if request.has_proxy():
            # With a proxy the selector holds the full URL; take the
            # host from it.
            _scheme, remainder = _splittype(request.selector)
            host_header, _path = _splithost(remainder)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', host_header)

        for header_name, header_value in self.parent.addheaders:
            header_name = header_name.capitalize()
            if not request.has_header(header_name):
                request.add_unredirected_header(header_name, header_value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        conn = http_class(host, timeout=req.timeout, **http_conn_args)
        conn.set_debuglevel(self._debuglevel)

        # Unredirected headers win over regular headers of the same name.
        merged = dict(req.unredirected_hdrs)
        for key, value in req.headers.items():
            merged.setdefault(key, value)

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        merged["Connection"] = "close"
        headers = {key.title(): value for key, value in merged.items()}

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                # Proxy-Authorization should not be sent to origin
                # server.
                tunnel_headers[proxy_auth_hdr] = headers.pop(proxy_auth_hdr)
            conn.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                conn.request(
                    req.get_method(), req.selector, req.data, headers,
                    encode_chunked=req.has_header('Transfer-encoding'))
            except OSError as err:  # timeout error
                raise URLError(err)
            response = conn.getresponse()
        except BaseException:
            # Close the connection on any failure, then propagate.
            conn.close()
            raise

        # If the server does not send us a 'Connection: close' header,
        # HTTPConnection assumes the socket should be left open. Manually
        # mark the socket to be closed when this response object goes away.
        if conn.sock:
            conn.sock.close()
            conn.sock = None

        response.url = req.get_full_url()
        # Overwrite .msg with the reason phrase, because urllib clients
        # expect the response to have the reason in .msg.  It would be
        # good to mark this attribute as deprecated and get them to use
        # info() or .headers.
        response.msg = response.reason
        return response
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001333
1334
class HTTPHandler(AbstractHTTPHandler):
    """Handler for plain http:// URLs."""

    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Open req over an http.client.HTTPConnection."""
        connection_class = http.client.HTTPConnection
        return self.do_open(connection_class, req)
1341
# HTTPSHandler is only defined (and exported) when http.client provides
# HTTPSConnection.
if hasattr(http.client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Handler for https:// URLs."""

        https_request = AbstractHTTPHandler.do_request_

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            """Open req over an http.client.HTTPSConnection."""
            conn_options = {
                'context': self._context,
                'check_hostname': self._check_hostname,
            }
            return self.do_open(
                http.client.HTTPSConnection, req, **conn_options)

    __all__.append('HTTPSHandler')
Senthil Kumaran0d54eb92011-11-01 23:49:46 +08001358
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to outgoing requests and collect them from responses."""

    def __init__(self, cookiejar=None):
        import http.cookiejar
        if cookiejar is not None:
            self.cookiejar = cookiejar
        else:
            self.cookiejar = http.cookiejar.CookieJar()

    def http_request(self, request):
        """Let the cookie jar add its Cookie header, then pass request on."""
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Let the cookie jar read cookies from response, then pass it on."""
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1376
class UnknownHandler(BaseHandler):
    """Handler that rejects a request by raising URLError for its URL type."""

    def unknown_open(self, req):
        """Raise URLError naming the request's URL type."""
        raise URLError('unknown url type: %s' % req.type)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001381
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dict mapping each key to its value, with one pair of
    surrounding double quotes removed from the value.  An empty value
    ("key=") is stored as ''.  An element without '=' raises ValueError.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices instead of v[0] / v[-1] so an empty value does not
        # raise IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1391
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    buf = []            # characters of the element being collected
    escaped = False     # previous character was a backslash inside quotes
    in_quotes = False

    for ch in s:
        if escaped:
            # Escaped character is taken literally.
            buf.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                # The backslash itself is dropped.
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                buf.append(ch)
        elif ch == ',':
            # Unquoted comma ends the current element.
            items.append(''.join(buf))
            buf = []
        else:
            if ch == '"':
                in_quotes = True
            buf.append(ch)

    # append last part
    if buf:
        items.append(''.join(buf))

    return [item.strip() for item in items]
1434
class FileHandler(BaseHandler):
    """Open file: URLs that refer to the local machine."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        selector = req.selector
        names_remote_host = (
            selector[:2] == '//' and selector[2:3] != '/'
            and (req.host and req.host != 'localhost'))
        if not names_remote_host:
            return self.open_local_file(req)
        if req.host not in self.get_names():
            raise URLError("file:// scheme is supported only on localhost")

    # names for the localhost
    names = None

    def get_names(self):
        """Return the tuple of addresses treated as this machine.

        The result is cached on the FileHandler class.
        """
        if FileHandler.names is None:
            try:
                loopback = socket.gethostbyname_ex('localhost')[2]
                own = socket.gethostbyname_ex(socket.gethostname())[2]
                FileHandler.names = tuple(loopback + own)
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the local file named by req.

        Raises URLError when the file cannot be accessed or when the
        request's host is not this machine.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        local_path = url2pathname(filename)
        try:
            info = os.stat(local_path)
            size = info.st_size
            modified = email.utils.formatdate(info.st_mtime, usegmt=True)
            mime_type = mimetypes.guess_type(filename)[0] or 'text/plain'
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mime_type, size, modified))
            port = None
            if host:
                host, port = _splitport(host)
            # Local means: no host at all, or a port-less host that
            # resolves to one of this machine's addresses.
            is_local = not host or (
                not port and _safe_gethostbyname(host) in self.get_names())
            if is_local:
                origurl = 'file://' + (host or '') + filename
                return addinfourl(open(local_path, 'rb'), headers, origurl)
        except OSError as exp:
            raise URLError(exp)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001485
1486def _safe_gethostbyname(host):
1487 try:
1488 return socket.gethostbyname(host)
1489 except socket.gaierror:
1490 return None
1491
class FTPHandler(BaseHandler):
    """Handler that serves ftp: URLs through an ftpwrapper connection."""

    def ftp_open(self, req):
        """Fetch the object named by *req* and return it as an addinfourl.

        Raises URLError when no host is given, when the host name cannot be
        resolved, or when ftplib reports an error during the transfer.
        """
        import ftplib
        import mimetypes

        netloc = req.host
        if not netloc:
            raise URLError('ftp error: no host given')
        netloc, port = _splitport(netloc)
        port = ftplib.FTP_PORT if port is None else int(port)

        # username/password handling
        user, host = _splituser(netloc)
        passwd = None
        if user:
            user, passwd = _splitpasswd(user)
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except OSError as msg:
            raise URLError(msg)

        path, attrs = _splitattr(req.selector)
        # Everything before the last '/' names directories to descend into;
        # the final component is the file (empty for a directory listing).
        *dirs, file = [unquote(piece) for piece in path.split('/')]
        if dirs and not dirs[0]:
            del dirs[0]

        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            transfer_type = 'I' if file else 'D'
            for attr in attrs:
                key, value = _splitvalue(attr)
                if key.lower() == 'type' and \
                        value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    transfer_type = value.upper()
            fp, retrlen = fw.retrfile(file, transfer_type)

            header_text = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                header_text += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                header_text += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(header_text)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as exp:
            exc = URLError('ftp error: %r' % exp)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a new non-persistent ftpwrapper for the given endpoint."""
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001549
class CacheFTPHandler(FTPHandler):
    """FTPHandler that reuses ftpwrapper connections from a timed cache."""
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        self.cache = {}         # key -> ftpwrapper
        self.timeout = {}       # key -> time.time() value at which it expires
        self.soonest = 0        # smallest value in self.timeout, 0 if empty
        self.delay = 60         # seconds added to "now" on every use
        self.max_conns = 16

    def setTimeout(self, t):
        """Set the number of seconds a connection stays cached after use."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which the oldest connection is evicted."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return the cached ftpwrapper for this endpoint, creating it if needed."""
        key = user, host, port, '/'.join(dirs), timeout
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Close expired connections, then evict the oldest one if the cache is full."""
        # first check for old ones
        now = time.time()
        if self.soonest <= now:
            for key, expires in list(self.timeout.items()):
                if expires < now:
                    self.cache[key].close()
                    del self.cache[key]
                    del self.timeout[key]
        # default=0 keeps an empty cache from raising ValueError
        self.soonest = min(self.timeout.values(), default=0)

        # then check the size; never evict the only entry, because it is the
        # connection connect_ftp() is about to hand back
        if len(self.cache) >= self.max_conns and len(self.cache) > 1:
            oldest = min(self.timeout, key=self.timeout.get)
            conn = self.cache.pop(oldest)
            del self.timeout[oldest]
            # close the evicted connection so its socket is not leaked
            conn.close()
            self.soonest = min(self.timeout.values(), default=0)

    def clear_cache(self):
        """Close every cached connection and empty the cache."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
1602
class DataHandler(BaseHandler):
    """Handler for data: URLs (RFC 2397)."""

    def data_open(self, req):
        """Decode the payload of a data: URL and return it as an addinfourl.

        POSTed data is ignored.  URL syntax:

            dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            mediatype := [ type "/" subtype ] *( ";" parameter )
            data      := *urlchar
            parameter := attribute "=" value
        """
        url = req.full_url

        _, remainder = url.split(":", 1)
        mediatype, encoded = remainder.split(",", 1)

        # even base64 encoded data URLs might be quoted so unquote in any case:
        payload = unquote_to_bytes(encoded)
        base64_marker = ";base64"
        if mediatype.endswith(base64_marker):
            payload = base64.decodebytes(payload)
            mediatype = mediatype[:-len(base64_marker)]

        mediatype = mediatype or "text/plain;charset=US-ASCII"

        header_text = "Content-type: %s\nContent-length: %d\n" % (
            mediatype, len(payload))
        headers = email.message_from_string(header_text)

        return addinfourl(io.BytesIO(payload), headers, url)
1632
Nadeem Vawda08f5f7a2011-07-23 14:03:00 +02001633
# Code moved from the old urllib module

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Path <-> URL helpers; Windows has its own implementation in nturl2path
if os.name != 'nt':
    def url2pathname(pathname):
        """Convert the path part of a 'file' URL into a file system path.

        OS-specific; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """Convert a file system path into the path part of a 'file' URL.

        OS-specific; not recommended for general use."""
        return quote(pathname)
else:
    from nturl2path import url2pathname, pathname2url


ftpcache = {}
Senthil Kumarana2a9ddd2017-04-08 23:27:25 -07001654
1655
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    __tempfiles = None
    # Class-level default so that cleanup(), which __del__ reaches through
    # close(), also works on an instance whose __init__ did not complete.
    tempcache = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        """Set up proxies, client certificate files and default headers.

        proxies is a mapping of URL scheme to proxy URL; getproxies() is
        consulted when it is None.  The optional key_file and cert_file
        keyword arguments are used for https connections.
        """
        msg = "%(class)s style of invoking requests is deprecated. " \
              "Use newer urlopen functions/methods" % {'class': self.__class__.__name__}
        warnings.warn(msg, DeprecationWarning, stacklevel=3)
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version), ('Accept', '*/*')]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        """Release the temporary files and cache held by this opener."""
        self.cleanup()

    def cleanup(self):
        """Delete the temporary files made by retrieve() and empty tempcache."""
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(_to_bytes(fullurl))
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = _splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = _splittype(proxy)
            host, selector = _splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        # open_local_file is deliberately not reachable as a URL scheme
        if not hasattr(self, name) or name == 'open_local_file':
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except (HTTPError, URLError):
            raise
        except OSError as msg:
            raise OSError('socket error', msg).with_traceback(sys.exc_info()[2])

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        urltype, url = _splittype(fullurl)
        raise OSError('url error', 'unknown url type', urltype)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        urltype, url = _splittype(fullurl)
        raise OSError('url error', 'invalid proxy for %s' % urltype, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(_to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        urltype, url1 = _splittype(url)
        if filename is None and (not urltype or urltype == 'file'):
            # A local file needs no copy; fall back to open() below when it
            # cannot be opened directly.
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return url2pathname(_splithost(url1)[1]), hdrs
            except OSError:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # Give the temporary file the same suffix as the URL path
                garbage, path = _splittype(url)
                garbage, path = _splithost(path or "")
                path, garbage = _splitquery(path or "")
                path, garbage = _splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    reporthook(blocknum, bs, size)
                while True:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result

    # Each method named open_<type> knows how to open that type of URL

    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_class.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieval or a host, relative-path pair.
        - data is payload for a POST request or None.
        """

        user_passwd = None
        proxy_passwd = None
        if isinstance(url, str):
            host, selector = _splithost(url)
            if host:
                user_passwd, host = _splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = _splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = _splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = _splithost(rest)
                if realhost:
                    user_passwd, realhost = _splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

        if not host:
            raise OSError('http error', 'no host given')

        if proxy_passwd:
            proxy_passwd = unquote(proxy_passwd)
            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
        else:
            proxy_auth = None

        if user_passwd:
            user_passwd = unquote(user_passwd)
            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
        else:
            auth = None
        http_conn = connection_factory(host)
        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] = "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost

        # Add Connection:close as we don't support persistent connections yet.
        # This helps in closing the socket and avoiding ResourceWarning

        headers["Connection"] = "close"

        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line; no response
            # object will own the socket, so release it here
            http_conn.close()
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        return self._open_generic_http(http.client.HTTPConnection, url, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result:
                return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise OSError."""
        fp.close()
        raise HTTPError(url, errcode, errmsg, headers, None)

    if _have_ssl:
        def _https_connection(self, host):
            """Return an HTTPSConnection using this opener's key/cert files."""
            return http.client.HTTPSConnection(host,
                                               key_file=self.key_file,
                                               cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise URLError('file error: proxy support for file protocol currently not implemented')
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            raise ValueError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import email.utils
        import mimetypes
        host, file = _splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = _splitport(host)
        if (not port
                and socket.gethostbyname(host) in ((localhost(),) + thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            elif file[:2] == './':
                raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error: not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        if not isinstance(url, str):
            raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
        import mimetypes
        host, path = _splithost(url)
        if not host:
            raise URLError('ftp error: no host given')
        host, port = _splitport(host)
        user, host = _splituser(host)
        if user:
            user, passwd = _splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = _splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        if dirs and not dirs[0]:
            dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in list(self.ftpcache):
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if key not in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            # 'D' requests a directory listing, 'I' a binary transfer; a
            # ";type=" attribute in the URL overrides the choice.
            transfer_type = 'I' if file else 'D'
            for attr in attrs:
                attr, value = _splitvalue(attr)
                if attr.lower() == 'type' and \
                        value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    transfer_type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, transfer_type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors() as exp:
            raise URLError('ftp error %r' % exp).with_traceback(sys.exc_info()[2])

    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise URLError('data error: proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        try:
            mediatype, data = url.split(',', 1)
        except ValueError:
            raise OSError('data error', 'bad data URL')
        if not mediatype:
            mediatype = 'text/plain;charset=US-ASCII'
        # A trailing ";token" without "=" is the encoding, not a parameter
        semi = mediatype.rfind(';')
        if semi >= 0 and '=' not in mediatype[semi:]:
            encoding = mediatype[semi+1:]
            mediatype = mediatype[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % mediatype)
        if encoding == 'base64':
            # XXX is this encoding/decoding ok?
            data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        headers = email.message_from_string(msg)
        f = io.StringIO(msg)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002099
2100
2101class FancyURLopener(URLopener):
2102 """Derived class with handlers for errors we can handle (perhaps)."""
2103
def __init__(self, *args, **kwargs):
    """Initialise the base opener, then the auth cache and redirect counters."""
    URLopener.__init__(self, *args, **kwargs)
    # redirect bookkeeping used by http_error_302()
    self.maxtries = 10
    self.tries = 0
    self.auth_cache = {}
2108 self.maxtries = 10
2109
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Default error handling -- don't raise an exception."""
    # Hand the error body back to the caller as an ordinary response
    full_url = "http:" + url
    return addinfourl(fp, headers, full_url, errcode)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002113
def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 302 -- relocated (temporarily)."""
    self.tries += 1
    try:
        limit_reached = self.maxtries and self.tries >= self.maxtries
        if not limit_reached:
            return self.redirect_internal(url, fp, errcode, errmsg,
                                          headers, data)
        # Too many redirects in a row: report it as a 500
        if hasattr(self, "http_error_500"):
            handler = self.http_error_500
        else:
            handler = self.http_error_default
        return handler(url, fp, 500,
                       "Internal Server Error: Redirect Recursion",
                       headers)
    finally:
        self.tries = 0
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002131
def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
    """Follow the redirect named by the 'location' or 'uri' header.

    Returns None when neither header is present; otherwise closes fp and
    returns self.open() of the target.  Raises HTTPError when the target
    scheme is not http, https, ftp or empty.
    """
    for field in ('location', 'uri'):
        if field in headers:
            target = headers[field]
            break
    else:
        return None
    fp.close()

    # In case the server sent a relative URL, join with original:
    target = urljoin(self.type + ":" + url, target)

    # For security reasons, we don't allow redirection to anything other
    # than http, https and ftp.

    # We are using newer HTTPError with older redirect_internal method
    # This older method will get deprecated in 3.3

    if urlparse(target).scheme in ('http', 'https', 'ftp', ''):
        return self.open(target)

    raise HTTPError(target, errcode,
                    errmsg +
                    " Redirection to url '%s' is not allowed." % target,
                    headers, fp)
2159
def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 301 -- also relocated (permanently)."""
    # Handled exactly like a temporary redirect
    follow = self.http_error_302
    return follow(url, fp, errcode, errmsg, headers, data)
2163
def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 303 -- see other; delegated unchanged to the 302 handler."""
    handle_redirect = self.http_error_302
    return handle_redirect(url, fp, errcode, errmsg, headers, data)
2167
def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 307 -- relocated, but turn POST into error.

    A request that carried *data* is handed to http_error_default instead
    of being redirected; a request without data follows the 302 path.
    """
    if data is not None:
        return self.http_error_default(url, fp, errcode, errmsg, headers)
    return self.http_error_302(url, fp, errcode, errmsg, headers, data)
2174
def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
                   retry=False):
    """Error 401 -- authentication required.

    This function supports Basic authentication only.  The request is
    retried through self.retry_<type>_basic_auth only when *retry* is true
    and the 'www-authenticate' header carries a Basic challenge with a
    quoted realm; in every other case URLopener.http_error_default is
    invoked.
    """
    def reject():
        # NOTE(review): the code below relies on this call raising;
        # confirm against URLopener.http_error_default.
        URLopener.http_error_default(self, url, fp,
                                     errcode, errmsg, headers)

    if 'www-authenticate' not in headers:
        reject()
    challenge = headers['www-authenticate']
    parsed = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', challenge)
    if not parsed:
        reject()
    scheme, realm = parsed.groups()
    if scheme.lower() != 'basic':
        reject()
    if not retry:
        reject()
    retry_method = getattr(self, 'retry_' + self.type + '_basic_auth')
    if data is None:
        return retry_method(url, realm)
    return retry_method(url, realm, data)
2199
def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
                   retry=False):
    """Error 407 -- proxy authentication required.

    This function supports Basic authentication only.  The request is
    retried through self.retry_proxy_<type>_basic_auth only when *retry*
    is true and the 'proxy-authenticate' header carries a Basic challenge
    with a quoted realm; in every other case URLopener.http_error_default
    is invoked.
    """
    def reject():
        # NOTE(review): the code below relies on this call raising;
        # confirm against URLopener.http_error_default.
        URLopener.http_error_default(self, url, fp,
                                     errcode, errmsg, headers)

    if 'proxy-authenticate' not in headers:
        reject()
    challenge = headers['proxy-authenticate']
    parsed = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', challenge)
    if not parsed:
        reject()
    scheme, realm = parsed.groups()
    if scheme.lower() != 'basic':
        reject()
    if not retry:
        reject()
    retry_method = getattr(self, 'retry_proxy_' + self.type + '_basic_auth')
    if data is None:
        return retry_method(url, realm)
    return retry_method(url, realm, data)
2224
def retry_proxy_http_basic_auth(self, url, realm, data=None):
    """Re-open *url* over http with credentials embedded in the http proxy.

    Credentials come from self.get_user_passwd; when it yields neither a
    user nor a password, None is returned and self.proxies is untouched.
    Otherwise self.proxies['http'] is rewritten to carry the percent-quoted
    "user:password@" prefix before the request is re-opened.
    """
    target_host, target_path = _splithost(url)
    retry_url = 'http://' + target_host + target_path
    _scheme, proxy_rest = _splittype(self.proxies['http'])
    proxy_netloc, proxy_path = _splithost(proxy_rest)
    # Index just past any existing "user:pass@" prefix (0 when absent);
    # it doubles as the clear_cache flag handed to get_user_passwd.
    cred_end = proxy_netloc.find('@') + 1
    proxy_netloc = proxy_netloc[cred_end:]
    user, passwd = self.get_user_passwd(proxy_netloc, realm, cred_end)
    if not user and not passwd:
        return None
    proxy_netloc = "%s:%s@%s" % (quote(user, safe=''),
                                 quote(passwd, safe=''), proxy_netloc)
    self.proxies['http'] = 'http://' + proxy_netloc + proxy_path
    if data is None:
        return self.open(retry_url)
    return self.open(retry_url, data)
2242
def retry_proxy_https_basic_auth(self, url, realm, data=None):
    """Re-open *url* over https with credentials embedded in the https proxy.

    Credentials come from self.get_user_passwd; when it yields neither a
    user nor a password, None is returned and self.proxies is untouched.
    Otherwise self.proxies['https'] is rewritten to carry the
    percent-quoted "user:password@" prefix before the request is re-opened.
    """
    target_host, target_path = _splithost(url)
    retry_url = 'https://' + target_host + target_path
    _scheme, proxy_rest = _splittype(self.proxies['https'])
    proxy_netloc, proxy_path = _splithost(proxy_rest)
    # Index just past any existing "user:pass@" prefix (0 when absent);
    # it doubles as the clear_cache flag handed to get_user_passwd.
    cred_end = proxy_netloc.find('@') + 1
    proxy_netloc = proxy_netloc[cred_end:]
    user, passwd = self.get_user_passwd(proxy_netloc, realm, cred_end)
    if not user and not passwd:
        return None
    proxy_netloc = "%s:%s@%s" % (quote(user, safe=''),
                                 quote(passwd, safe=''), proxy_netloc)
    self.proxies['https'] = 'https://' + proxy_netloc + proxy_path
    if data is None:
        return self.open(retry_url)
    return self.open(retry_url, data)
2260
def retry_http_basic_auth(self, url, realm, data=None):
    """Re-open *url* over http with credentials embedded in the host part.

    Any "user@" prefix already present in the host is dropped, credentials
    are requested from self.get_user_passwd, and the request is re-opened
    with the percent-quoted "user:password@" prefix.  Returns None when
    neither a user nor a password is available.
    """
    netloc, path = _splithost(url)
    # Index just past any existing "user@" prefix (0 when absent); it
    # doubles as the clear_cache flag handed to get_user_passwd.
    cred_end = netloc.find('@') + 1
    netloc = netloc[cred_end:]
    user, passwd = self.get_user_passwd(netloc, realm, cred_end)
    if not user and not passwd:
        return None
    netloc = "%s:%s@%s" % (quote(user, safe=''),
                           quote(passwd, safe=''), netloc)
    retry_url = 'http://' + netloc + path
    if data is None:
        return self.open(retry_url)
    return self.open(retry_url, data)
2274
def retry_https_basic_auth(self, url, realm, data=None):
    """Re-open *url* over https with credentials embedded in the host part.

    Any "user@" prefix already present in the host is dropped, credentials
    are requested from self.get_user_passwd, and the request is re-opened
    with the percent-quoted "user:password@" prefix.  Returns None when
    neither a user nor a password is available.
    """
    netloc, path = _splithost(url)
    # Index just past any existing "user@" prefix (0 when absent); it
    # doubles as the clear_cache flag handed to get_user_passwd.
    cred_end = netloc.find('@') + 1
    netloc = netloc[cred_end:]
    user, passwd = self.get_user_passwd(netloc, realm, cred_end)
    if not user and not passwd:
        return None
    netloc = "%s:%s@%s" % (quote(user, safe=''),
                           quote(passwd, safe=''), netloc)
    retry_url = 'https://' + netloc + path
    if data is None:
        return self.open(retry_url)
    return self.open(retry_url, data)
2288
def get_user_passwd(self, host, realm, clear_cache=0):
    """Return (user, passwd) for *realm* at *host*, using self.auth_cache.

    The cache key is realm + '@' + lower-cased host.  A cached pair is
    returned directly unless *clear_cache* is true, in which case the
    entry is dropped and self.prompt_user_passwd is asked again.  A
    prompted pair is cached only when at least one of its items is truthy.
    """
    key = realm + '@' + host.lower()
    cache = self.auth_cache
    if key in cache:
        if not clear_cache:
            return cache[key]
        del cache[key]
    user, passwd = self.prompt_user_passwd(host, realm)
    if user or passwd:
        cache[key] = (user, passwd)
    return user, passwd
2299
def prompt_user_passwd(self, host, realm):
    """Override this in a GUI environment!

    Asks for a user name with input() and a password with
    getpass.getpass().  Returns (user, passwd), or (None, None) when the
    prompt is interrupted with KeyboardInterrupt.
    """
    import getpass
    try:
        user = input("Enter username for %s at %s: " % (realm, host))
        passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                                 (user, realm, host))
    except KeyboardInterrupt:
        # Terminate the interrupted prompt line before giving up.
        print()
        return None, None
    else:
        return user, passwd
2311
2312
2313# Utility functions
2314
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup is performed once; the result is kept in the module-level
    _localhost and reused on later calls.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2322
_thishost = None
def thishost():
    """Return the IP addresses of the current host.

    The result is a tuple, computed once and kept in the module-level
    _thishost.  When resolving the machine's own host name raises
    socket.gaierror, the addresses of 'localhost' are used instead.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    try:
        addresses = socket.gethostbyname_ex(socket.gethostname())[2]
    except socket.gaierror:
        addresses = socket.gethostbyname_ex('localhost')[2]
    _thishost = tuple(addresses)
    return _thishost
2333
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported lazily on the first call; its all_errors value is
    then kept in the module-level _ftperrors.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2342
_noheaders = None
def noheaders():
    """Return an empty email Message object.

    The object is created once and shared by every caller through the
    module-level _noheaders.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2350
2351
2352# Utility classes
2353
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections.

    ``refcount`` counts the file objects returned by retrfile() that have
    not been closed yet; the FTP connection is only really closed once
    close() has been called (``keepalive`` false) and that count is zero.
    """

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        """Store the connection parameters and connect immediately.

        If connecting fails, the wrapper is closed and the exception is
        re-raised unchanged.
        """
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.refcount = 0
        self.keepalive = persistent
        try:
            self.init()
        except BaseException:
            # Always re-raised below; this only releases the connection.
            self.close()
            raise

    def init(self):
        """(Re)connect, log in and change into the '/'-joined ``dirs``."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        _target = '/'.join(self.dirs)
        self.ftp.cwd(_target)

    def retrfile(self, file, type):
        """Start retrieving *file* and return (file object, length).

        *type* is the FTP transfer type; 'd' or 'D' requests a directory
        listing.  A file retrieval that fails with a 550 reply falls back
        to a LIST of *file*; any other permanent error is raised as
        URLError, chained to the original ftplib error.  The length is
        whatever ntransfercmd() reported.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'):
            cmd = 'TYPE A'
            isdir = 1
        else:
            cmd = 'TYPE ' + type
            isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # Retry once on a freshly initialised connection.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                if str(reason)[:3] != '550':
                    raise URLError('ftp error: %r' % reason) from reason
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error: %r' % reason) from reason
                finally:
                    # Restore the working directory whatever happened.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        """Mark the connection as no longer busy."""
        self.busy = 0

    def close(self):
        """Stop keeping the connection alive; close it now if unused."""
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        """Close hook for files returned by retrfile().

        Drops one reference and closes the connection when no file is
        left open and close() has already been requested.
        """
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        """Close the FTP connection, ignoring FTP-level errors."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2447
2448# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    Variables spelled with a lowercase '_proxy' suffix take precedence
    over other spellings, and an empty one removes the scheme entirely.
    """
    suffix = '_proxy'
    cut = -len(suffix)

    # First pass: accept the variable name in any letter case.
    proxies = {}
    for raw_name, value in os.environ.items():
        lowered = raw_name.lower()
        if value and lowered.endswith(suffix):
            proxies[lowered[:cut]] = value

    # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY
    # (non-all-lowercase) as it may be set from the web server by a "Proxy:"
    # header from the client
    # If "proxy" is lowercase, it will still be used thanks to the next block
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)

    # Second pass: only names whose suffix is already lowercase; these
    # override (or, when empty, delete) what the first pass collected.
    for raw_name, value in os.environ.items():
        if not raw_name.endswith(suffix):
            continue
        scheme = raw_name.lower()[:cut]
        if value:
            proxies[scheme] = value
        else:
            proxies.pop(scheme, None)
    return proxies
2479
def proxy_bypass_environment(host, proxies=None):
    """Test if proxies should not be used for a particular host.

    Checks the proxy dict for the value of no_proxy, which should
    be a list of comma separated DNS suffixes, or '*' for all hosts.
    When *proxies* is None the dict comes from getproxies_environment().

    Returns 1 when the proxy should be bypassed and 0 otherwise.
    """
    if proxies is None:
        proxies = getproxies_environment()
    # don't bypass, if no_proxy isn't specified
    try:
        no_proxy = proxies['no']
    except KeyError:
        return 0
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # Compare both with and without the port.
    bare_host, _port = _splitport(host)
    candidates = (bare_host, host)
    for entry in no_proxy.split(','):
        entry = entry.strip()
        if not entry:
            continue
        # Leading dots are ignored; the entry matches the host itself or
        # any name ending in ".<entry>".
        suffix_re = r'(.+\.)?%s$' % re.escape(entry.lstrip('.'))
        if any(re.match(suffix_re, candidate, re.I)
               for candidate in candidates):
            return 1
    # otherwise, don't bypass
    return 0
2511
2512
Ronald Oussorene72e1612011-03-14 18:15:25 -04002513# This code tests an OSX specific data structure but is testable on all
2514# platforms
def _proxy_bypass_macosx_sysconf(host, proxy_settings):
    """
    Return True iff this host shouldn't be accessed using a proxy

    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
    { 'exclude_simple': bool,
      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
    }

    Exceptions that start with digits are treated as IPv4 networks and
    compared with the resolved address of the host; all others are matched
    against *host* as fnmatch patterns.  Network entries whose prefix
    length is outside 0..32 are ignored.
    """
    from fnmatch import fnmatch

    hostonly, port = _splitport(host)

    def ip2num(ipAddr):
        # Dotted quad -> 32-bit integer; short forms are zero-padded.
        parts = [int(part) for part in ipAddr.split('.')]
        if len(parts) != 4:
            parts = (parts + [0, 0, 0, 0])[:4]
        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

    # Check for simple host names:
    if '.' not in host and proxy_settings['exclude_simple']:
        return True

    hostIP = None

    for value in proxy_settings.get('exceptions', ()):
        # Items in the list are strings like these: *.local, 169.254/16
        if not value:
            continue

        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
        if m is not None:
            if hostIP is None:
                try:
                    hostIP = socket.gethostbyname(hostonly)
                    hostIP = ip2num(hostIP)
                except OSError:
                    continue

            base = ip2num(m.group(1))
            mask = m.group(2)
            if mask is None:
                # No explicit prefix: 8 bits per dotted component given.
                mask = 8 * (m.group(1).count('.') + 1)
            else:
                mask = int(mask[1:])

            if not 0 <= mask <= 32:
                # An out-of-range prefix would turn the shifts below into
                # negative shift counts (ValueError); skip such entries.
                continue

            mask = 32 - mask

            if (hostIP >> mask) == (base >> mask):
                return True

        elif fnmatch(host, value):
            return True

    return False
2573
2574
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002575if sys.platform == 'darwin':
Ronald Oussoren84151202010-04-18 20:46:11 +00002576 from _scproxy import _get_proxy_settings, _get_proxies
2577
def proxy_bypass_macosx_sysconf(host):
    """Return whether *host* should bypass the proxy per _get_proxy_settings()."""
    return _proxy_bypass_macosx_sysconf(host, _get_proxy_settings())
Ronald Oussoren84151202010-04-18 20:46:11 +00002581
def getproxies_macosx_sysconf():
    """Return a dictionary of scheme -> proxy server URL mappings.

    This function uses the MacOSX framework SystemConfiguration
    to fetch the proxy information.
    """
    system_proxies = _get_proxies()
    return system_proxies
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002589
Ronald Oussoren84151202010-04-18 20:46:11 +00002590
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002591
def proxy_bypass(host):
    """Return True, if host should be bypassed.

    Checks proxy settings gathered from the environment, if specified,
    or from the MacOSX framework SystemConfiguration.

    """
    env_proxies = getproxies_environment()
    if not env_proxies:
        return proxy_bypass_macosx_sysconf(host)
    return proxy_bypass_environment(host, env_proxies)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002604
def getproxies():
    """Return proxies from the environment, else from SystemConfiguration."""
    env_proxies = getproxies_environment()
    if env_proxies:
        return env_proxies
    return getproxies_macosx_sysconf()
2607
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002608
2609elif os.name == 'nt':
def getproxies_registry():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Win32 uses the registry to store proxies.

    The result is empty when winreg cannot be imported or when proxying
    is disabled.  If the key is missing or a value has an unexpected
    format, whatever was collected up to that point is returned.
    """
    proxies = {}
    try:
        import winreg
    except ImportError:
        # Std module, so should be around - but you never know!
        return proxies
    try:
        # The key is used as a context manager so that it is closed even
        # when reading or parsing a value raises.
        with winreg.OpenKey(
                winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings'
        ) as internetSettings:
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
    except (OSError, ValueError, TypeError):
        # Either registry key not found etc, or the value in an
        # unexpected format.
        # Anything gathered so far is still returned below.
        pass
    return proxies
2654
def getproxies():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Returns settings gathered from the environment, if specified,
    or the registry.

    """
    env_proxies = getproxies_environment()
    if env_proxies:
        return env_proxies
    return getproxies_registry()
2663
def proxy_bypass_registry(host):
    """Return 1 if the registry's ProxyOverride list says to bypass *host*.

    Returns 0 when winreg is unavailable, the settings cannot be read,
    proxying is disabled, the override list is empty, or nothing matches.
    """
    try:
        import winreg
    except ImportError:
        # Std modules, so should be around - but you never know!
        return 0
    try:
        # Context manager: the key is closed as soon as both values have
        # been read, and also when a read fails.
        with winreg.OpenKey(
                winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings'
        ) as internetSettings:
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            # Returned as Unicode but problems if not converted to ASCII
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
    except OSError:
        return 0
    if not proxyEnable or not proxyOverride:
        return 0
    # try to make a host list from name and IP address.
    rawHost, port = _splitport(host)
    candidates = [rawHost]
    try:
        addr = socket.gethostbyname(rawHost)
        if addr != rawHost:
            candidates.append(addr)
    except OSError:
        pass
    try:
        fqdn = socket.getfqdn(rawHost)
        if fqdn != rawHost:
            candidates.append(fqdn)
    except OSError:
        pass
    # now check if we match one of the registry values.
    for test in proxyOverride.split(';'):
        if test == '<local>':
            # '<local>' bypasses every host name without a dot.
            if '.' not in rawHost:
                return 1
        # NOTE(review): the pattern is matched with re.match, i.e. anchored
        # at the start only, and characters other than '.', '*' and '?' are
        # passed to the regex engine unescaped -- confirm this is intended.
        test = test.replace(".", r"\.")     # mask dots
        test = test.replace("*", r".*")     # change glob sequence
        test = test.replace("?", r".")      # change glob char
        for val in candidates:
            if re.match(test, val, re.I):
                return 1
    return 0
2713
def proxy_bypass(host):
    """Return True, if host should be bypassed.

    Checks proxy settings gathered from the environment, if specified,
    or the registry.

    """
    env_proxies = getproxies_environment()
    if not env_proxies:
        return proxy_bypass_registry(host)
    return proxy_bypass_environment(host, env_proxies)
2726
2727else:
2728 # By default use environment variables
2729 getproxies = getproxies_environment
2730 proxy_bypass = proxy_bypass_environment