blob: 30bf6e051ec6e97c5d9dc550c045db06d3cc7c88 [file] [log] [blame]
"""An extensible library for opening URLs using a variety of protocols

The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.

The OpenerDirector manages a collection of Handler objects that do
all the actual work.  Each Handler implements a particular protocol or
option.  The OpenerDirector is a composite object that invokes the
Handlers needed to open the requested URL.  For example, the
HTTPHandler performs HTTP GET and POST requests and deals with
non-error returns.  The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
deals with digest authentication.

urlopen(url, data=None) -- Basic usage is the same as original
urllib.  Pass the url and optionally data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of a URL.  Raises a URLError (subclass of
OSError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.

build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of the default
handler, the argument will be installed instead of the default.

install_opener -- Installs a new opener as the default opener.

objects of interest:

OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
the Handler classes, while dealing with requests and responses.

Request -- An object that encapsulates the state of a request.  The
state can be as simple as the URL.  It can also include extra HTTP
headers, e.g. a User-Agent.

BaseHandler -- Base class for handlers; records the parent
OpenerDirector and orders handlers by their handler_order attribute.

internals:
BaseHandler and parent
_call_chain conventions

Example usage:

import urllib.request

# set up authentication info
authinfo = urllib.request.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
                      uri='https://mahler:8092/site-updates.py',
                      user='klem',
                      passwd='geheim$parole')

proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib.request.build_opener(proxy_support, authinfo,
                                     urllib.request.CacheFTPHandler)

# install it
urllib.request.install_opener(opener)

f = urllib.request.urlopen('http://www.python.org/')
"""
69
# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails for some reason, how should the error be signalled?  The
# client needs to know the HTTP error code.  But if the handler knows
# what the problem was, e.g., that it didn't recognize the hash
# algorithm requested in the challenge, it would be good to pass that
# information along to the client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation

# Possible extensions:
# complex proxies  XXX not sure what exactly was meant by this
# abstract factory for opener
83
84import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000085import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000086import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
Jeremy Hylton1afc1692008-06-18 20:49:58 +000092import re
93import socket
Martin Pantere6f06092016-05-16 01:14:20 +000094import string
Jeremy Hylton1afc1692008-06-18 20:49:58 +000095import sys
96import time
Senthil Kumaran7bc0d872010-12-19 10:49:52 +000097import collections
Senthil Kumarane24f96a2012-03-13 19:29:33 -070098import tempfile
99import contextlib
Senthil Kumaran38b968b92012-03-14 13:43:53 -0700100import warnings
Senthil Kumarane24f96a2012-03-13 19:29:33 -0700101
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000102
Georg Brandl13e89462008-07-01 19:56:00 +0000103from urllib.error import URLError, HTTPError, ContentTooShortError
104from urllib.parse import (
105 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
106 splittype, splithost, splitport, splituser, splitpasswd,
Antoine Pitroudf204be2012-11-24 17:59:08 +0100107 splitattr, splitquery, splitvalue, splittag, to_bytes,
108 unquote_to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000109from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000110
111# check for SSL
112try:
113 import ssl
Brett Cannoncd171c82013-07-04 17:43:24 -0400114except ImportError:
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000115 _have_ssl = False
116else:
117 _have_ssl = True
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000118
# Names exported by ``from urllib.request import *``.
__all__ = [
    # Classes
    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
    'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler',
    'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler',
    'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler',
    'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler',
    'UnknownHandler', 'HTTPErrorProcessor',
    # Functions
    'urlopen', 'install_opener', 'build_opener',
    'pathname2url', 'url2pathname', 'getproxies',
    # Legacy interface
    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
]

# used in User-Agent header sent
__version__ = '%d.%d' % sys.version_info[:2]
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000138
# Module-wide default opener; created lazily by urlopen() or set explicitly
# through install_opener().
_opener = None


def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None, cadefault=False, context=None):
    '''Open the URL url, which can be either a string or a Request object.

    *data* must be an object specifying additional data to be sent to
    the server, or None if no such data is needed.  See Request for
    details.

    urllib.request module uses HTTP/1.1 and includes a "Connection:close"
    header in its HTTP requests.

    The optional *timeout* parameter specifies a timeout in seconds for
    blocking operations like the connection attempt (if not specified, the
    global default timeout setting will be used). This only works for HTTP,
    HTTPS and FTP connections.

    If *context* is specified, it must be a ssl.SSLContext instance describing
    the various SSL options. See HTTPSConnection for more details.

    The optional *cafile* and *capath* parameters specify a set of trusted CA
    certificates for HTTPS requests. cafile should point to a single file
    containing a bundle of CA certificates, whereas capath should point to a
    directory of hashed certificate files. More information can be found in
    ssl.SSLContext.load_verify_locations().

    The value of *cadefault* is not used when building the SSL context, but
    a true value (like a non-empty *cafile* or *capath*) still causes a
    default context to be created for this call.

    Passing *context* together with any of *cafile*, *capath* or *cadefault*
    raises ValueError, as does requesting a CA-based context when the ssl
    module is not available.

    This function always returns an object which can work as a context
    manager and has methods such as

    * geturl() - return the URL of the resource retrieved, commonly used to
      determine if a redirect was followed

    * info() - return the meta-information of the page, such as headers, in the
      form of an email.message_from_string() instance (see Quick Reference to
      HTTP Headers)

    * getcode() - return the HTTP status code of the response.  Raises URLError
      on errors.

    For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse
    object slightly modified. In addition to the three new methods above, the
    msg attribute contains the same information as the reason attribute ---
    the reason phrase returned by the server --- instead of the response
    headers as it is specified in the documentation for HTTPResponse.

    For FTP, file, and data URLs and requests explicitly handled by legacy
    URLopener and FancyURLopener classes, this function returns a
    urllib.response.addinfourl object.

    Note that None may be returned if no handler handles the request (though
    the default installed global OpenerDirector uses UnknownHandler to ensure
    this never happens).

    In addition, if proxy settings are detected (for example, when a *_proxy
    environment variable like http_proxy is set), ProxyHandler is default
    installed and makes sure the requests are handled through the proxy.

    '''
    global _opener
    if cafile or capath or cadefault:
        if context is not None:
            raise ValueError(
                "You can't pass both context and any of cafile, capath, and "
                "cadefault"
            )
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
                                             cafile=cafile,
                                             capath=capath)

    if context:
        # A per-call SSL context needs its own opener; the shared default
        # opener is deliberately left untouched.
        opener = build_opener(HTTPSHandler(context=context))
    elif _opener is None:
        # First plain call: build the default opener and cache it.
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000221
def install_opener(opener):
    """Install *opener* as the module-wide default used by urlopen().

    The object is stored as-is in the module global ``_opener``; no
    validation is performed.
    """
    global _opener
    _opener = opener
225
# Paths of temporary files created by urlretrieve(); removed by urlcleanup().
_url_tempfiles = []


def urlretrieve(url, filename=None, reporthook=None, data=None):
    """
    Retrieve a URL into a temporary location on disk.

    Requires a URL argument. If a filename is passed, it is used as
    the temporary file location. The reporthook argument should be
    a callable that accepts a block number, a read size, and the
    total file size of the URL target (-1 when the response has no
    Content-Length header). The data argument should be valid URL
    encoded data.

    If a filename is passed and the URL points to a local resource,
    the result is a copy from local file to new file.

    Returns a tuple containing the path to the newly created
    data file as well as the resulting HTTPMessage object.

    Raises ContentTooShortError when fewer bytes were read than the
    Content-Length header announced.
    """
    url_type, path = splittype(url)

    with contextlib.closing(urlopen(url, data)) as fp:
        headers = fp.info()

        # Just return the local path and the "headers" for file://
        # URLs. No sense in performing a copy unless requested.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        # Handle temporary file setup.
        if filename:
            tfp = open(filename, 'wb')
        else:
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = tfp.name
            # Remember the file so that urlcleanup() can delete it later.
            _url_tempfiles.append(filename)

        with tfp:
            result = filename, headers
            bs = 1024*8
            size = -1
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])

            # Initial call (block 0) happens before any data is read.
            if reporthook:
                reporthook(blocknum, bs, size)

            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)

    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000289
def urlcleanup():
    """Clean up temporary files from urlretrieve calls.

    Every path recorded in ``_url_tempfiles`` is unlinked on a best-effort
    basis (OSError is ignored), the list is emptied, and an installed
    default opener is dropped so the next urlopen() builds a fresh one.
    """
    global _opener
    for temp_file in _url_tempfiles:
        # Best effort: the file may already be gone or not be removable.
        with contextlib.suppress(OSError):
            os.unlink(temp_file)

    del _url_tempfiles[:]
    if _opener:
        _opener = None
302
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)


def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the network-location part of
    ``request.full_url``; when that is empty, the request's "Host" header
    is used instead.  A trailing ``:port`` is removed.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    url = request.full_url
    host = urlparse(url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    host = _cut_port_re.sub("", host, 1)
    return host.lower()
320
class Request:
    """Encapsulate the state of a single URL request.

    Attributes set by the constructor and the ``full_url`` setter include
    ``type`` (URL scheme), ``host``, ``selector`` (everything after the
    host), ``fragment``, ``headers`` and ``unredirected_hdrs`` (both keyed
    by the capitalize()d header name), ``origin_req_host`` and
    ``unverifiable``.
    """

    # NOTE: the shared ``headers={}`` default is only iterated, never mutated.
    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):
        """Build a request for *url*.

        *headers* is a mapping of header names to values; each entry goes
        through add_header().  When *origin_req_host* is None it is derived
        from the URL via request_host().  A true *method* overrides the
        GET/POST default reported by get_method().

        Raises ValueError if *url* has no scheme.
        """
        self.full_url = url
        self.headers = {}
        self.unredirected_hdrs = {}
        self._data = None
        # Assigned through the property; must come after the header dicts
        # exist because the setter may remove Content-length.
        self.data = data
        self._tunnel_host = None
        for key, value in headers.items():
            self.add_header(key, value)
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        if method:
            self.method = method

    @property
    def full_url(self):
        """The request URL, with ``#fragment`` re-attached when present."""
        if self.fragment:
            return '{}#{}'.format(self._full_url, self.fragment)
        return self._full_url

    @full_url.setter
    def full_url(self, url):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self._full_url = unwrap(url)
        self._full_url, self.fragment = splittag(self._full_url)
        self._parse()

    @full_url.deleter
    def full_url(self):
        self._full_url = None
        self.fragment = None
        self.selector = ''

    @property
    def data(self):
        """The request body, or None when there is none."""
        return self._data

    @data.setter
    def data(self, data):
        if data != self._data:
            self._data = data
            # issue 16464
            # if we change data we need to remove content-length header
            # (cause it's most probably calculated for previous value)
            if self.has_header("Content-length"):
                self.remove_header("Content-length")

    @data.deleter
    def data(self):
        self.data = None

    def _parse(self):
        """Split the stored URL into ``type``, ``host`` and ``selector``."""
        self.type, rest = splittype(self._full_url)
        if self.type is None:
            raise ValueError("unknown url type: %r" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        default_method = "POST" if self.data is not None else "GET"
        return getattr(self, 'method', default_method)

    def get_full_url(self):
        """Return the ``full_url`` property."""
        return self.full_url

    def set_proxy(self, host, type):
        """Route the request through the proxy *host*.

        For an https request without a tunnel host yet, the original host
        is remembered in ``_tunnel_host`` and the scheme is kept; otherwise
        the scheme becomes *type* and the selector becomes the full URL.
        """
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        """Return True when the selector equals the full URL."""
        return self.selector == self.full_url

    def add_header(self, key, val):
        """Store header *key* (capitalize()d) with value *val*."""
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        """Store a header that is kept apart from ``headers``."""
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """Return True if *header_name* is stored, matched exactly as given."""
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        """Return the header value, preferring ``headers``, else *default*."""
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def remove_header(self, header_name):
        """Remove *header_name* from both header dicts; missing is fine."""
        self.headers.pop(header_name, None)
        self.unredirected_hdrs.pop(header_name, None)

    def header_items(self):
        """Return all headers as (name, value) pairs; ``headers`` wins ties."""
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
430
class OpenerDirector:
    """Open URLs by dispatching to chains of registered handlers.

    Handlers are registered with add_handler(), which inspects their
    method names (``<protocol>_open``, ``<protocol>_request``,
    ``<protocol>_response``, ``<protocol>_error_<kind>``) to decide which
    chains they join.
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler* in every chain its method names match.

        Raises TypeError if *handler* has no ``add_parent`` attribute.
        A handler that matches no chain is silently ignored.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # Split "<protocol>_<condition>" at the first underscore.
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # "<protocol>_error_<kind>": kind is whatever follows the
                # second underscore, as an int when it is numeric.
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.setdefault(protocol, {})
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                # Keep each chain sorted by the handlers' own ordering.
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        """Call *meth_name* on each handler in ``chain[kind]`` in order.

        Returns the first result that is not None, or None if every
        handler declined (or the chain is empty).
        """
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a Request-like object).

        Runs the ``<protocol>_request`` pre-processors, the open chains,
        then the ``<protocol>_response`` post-processors, and returns the
        final response.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        """Try the 'default', protocol-specific, then 'unknown' open chains."""
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error for *proto* to the matching error chain.

        For 'http' and 'https' the third positional argument is taken as
        the status code and ``http_error_<code>`` handlers are tried first,
        falling back to ``http_error_default``.  Other protocols use
        ``<proto>_error``.
        """
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            chain = self.handle_error['http']  # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = True
            orig_args = args
        else:
            chain = self.handle_error
            meth_name = proto + '_error'
            http_err = False
        args = (chain, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (chain, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
568
569# XXX probably also want an abstract factory that knows when it makes
570# sense to skip a superclass in favor of a subclass and when it might
571# make sense to include both
572
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    Each argument may be a handler instance or a handler class (which
    is instantiated without arguments).

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor,
                       DataHandler]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    # A default is skipped when a supplied class is a subclass of it or a
    # supplied instance is an instance of it.
    skip = set()
    for klass in default_classes:
        for check in handlers:
            if isinstance(check, type):
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                skip.add(klass)
    default_classes = [klass for klass in default_classes
                       if klass not in skip]

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if isinstance(h, type):
            h = h()
        opener.add_handler(h)
    return opener
608
class BaseHandler:
    """Base class for handlers registered with an OpenerDirector.

    ``handler_order`` determines the position inside a handler chain:
    lower values sort first.
    """
    handler_order = 500

    def add_parent(self, parent):
        """Remember *parent* as ``self.parent``."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        """Order handlers by ``handler_order``."""
        if not hasattr(other, "handler_order"):
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other.handler_order
626
627
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return *response* for 2xx codes, else the parent's error result."""
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if not (200 <= code < 300):
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response
644
class HTTPDefaultErrorHandler(BaseHandler):
    """Turn any HTTP error that reaches it into an HTTPError exception."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        """Raise HTTPError built from the request URL and response details."""
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000648
649class HTTPRedirectHandler(BaseHandler):
650 # maximum number of redirections to any single URL
651 # this is needed because of the state that cookies introduce
652 max_repeats = 4
653 # maximum total number of redirections (regardless of URL) before
654 # assuming we're in a loop
655 max_redirections = 10
656
657 def redirect_request(self, req, fp, code, msg, headers, newurl):
658 """Return a Request or None in response to a redirect.
659
660 This is called by the http_error_30x methods when a
661 redirection response is received. If a redirection should
662 take place, return a new Request to allow http_error_30x to
663 perform the redirect. Otherwise, raise HTTPError if no-one
664 else should try to handle this url. Return None if you can't
665 but another Handler might.
666 """
667 m = req.get_method()
668 if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
669 or code in (301, 302, 303) and m == "POST")):
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000670 raise HTTPError(req.full_url, code, msg, headers, fp)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000671
672 # Strictly (according to RFC 2616), 301 or 302 in response to
673 # a POST MUST NOT cause a redirection without confirmation
Georg Brandl029986a2008-06-23 11:44:14 +0000674 # from the user (of urllib.request, in this case). In practice,
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000675 # essentially all clients do redirect in this case, so we do
676 # the same.
Martin Pantere6f06092016-05-16 01:14:20 +0000677
678 # Be conciliant with URIs containing a space. This is mainly
679 # redundant with the more complete encoding done in http_error_302(),
680 # but it is kept for compatibility with other callers.
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000681 newurl = newurl.replace(' ', '%20')
Martin Pantere6f06092016-05-16 01:14:20 +0000682
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000683 CONTENT_HEADERS = ("content-length", "content-type")
684 newheaders = dict((k, v) for k, v in req.headers.items()
685 if k.lower() not in CONTENT_HEADERS)
686 return Request(newurl,
687 headers=newheaders,
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000688 origin_req_host=req.origin_req_host,
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000689 unverifiable=True)
690
691 # Implementation note: To avoid the server sending us into an
692 # infinite loop, the request object needs to track what URLs we
693 # have already seen. Do this by adding a handler-specific
694 # attribute to the Request object.
def http_error_302(self, req, fp, code, msg, headers):
    """Follow a redirect response and return the result of the new request.

    Returns None when the response names no target or when
    redirect_request() declines to build a follow-up request.  Raises
    HTTPError for a disallowed target scheme or when the redirect limits
    (max_repeats / max_redirections) are reached.
    """
    # Some servers (incorrectly) return multiple Location headers
    # (so probably same goes for URI).  Use first header.
    for header_name in ("location", "uri"):
        if header_name in headers:
            newurl = headers[header_name]
            break
    else:
        return

    # fix a possible malformed URL
    urlparts = urlparse(newurl)

    # For security reasons we don't allow redirection to anything other
    # than http, https or ftp.  An empty scheme means a relative URL.
    if urlparts.scheme not in ('http', 'https', 'ftp', ''):
        raise HTTPError(
            newurl, code,
            "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
            headers, fp)

    # A bare authority ("http://host") gets an explicit root path.
    if urlparts.netloc and not urlparts.path:
        urlparts = list(urlparts)
        urlparts[2] = "/"
    newurl = urlunparse(urlparts)

    # http.client.parse_headers() decodes as ISO-8859-1.  Recover the
    # original bytes and percent-encode non-ASCII bytes, and any special
    # characters such as the space.
    newurl = quote(newurl, encoding="iso-8859-1", safe=string.punctuation)
    newurl = urljoin(req.full_url, newurl)

    # XXX Probably want to forget about the state of the current
    # request, although that might interact poorly with other
    # handlers that also use handler-specific request attributes
    new = self.redirect_request(req, fp, code, msg, headers, newurl)
    if new is None:
        return

    # loop detection
    # .redirect_dict maps each URL already visited to its visit count and
    # is shared between the original request and every follow-up request.
    if not hasattr(req, 'redirect_dict'):
        visited = new.redirect_dict = req.redirect_dict = {}
    else:
        visited = new.redirect_dict = req.redirect_dict
        too_many_repeats = visited.get(newurl, 0) >= self.max_repeats
        if too_many_repeats or len(visited) >= self.max_redirections:
            raise HTTPError(req.full_url, code,
                            self.inf_msg + msg, headers, fp)
    visited[newurl] = visited.get(newurl, 0) + 1

    # Don't close the fp until we are sure that we won't use it
    # with HTTPError.
    fp.read()
    fp.close()

    return self.parent.open(new, timeout=req.timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000754
# 301, 303 and 307 responses are handled by the same code as 302.
http_error_301 = http_error_302
http_error_303 = http_error_302
http_error_307 = http_error_302

# Prefix of the HTTPError message raised when redirect loop detection trips.
inf_msg = ("The HTTP server returned a redirect error that would "
           "lead to an infinite loop.\n"
           "The last 30x error message was:\n")
760
761
762def _parse_proxy(proxy):
763 """Return (scheme, user, password, host/port) given a URL or an authority.
764
765 If a URL is supplied, it must have an authority (host:port) component.
766 According to RFC 3986, having an authority component means the URL must
Senthil Kumarand8e24f12014-04-14 16:32:20 -0400767 have two slashes after the scheme.
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000768 """
Georg Brandl13e89462008-07-01 19:56:00 +0000769 scheme, r_scheme = splittype(proxy)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000770 if not r_scheme.startswith("/"):
771 # authority
772 scheme = None
773 authority = proxy
774 else:
775 # URL
776 if not r_scheme.startswith("//"):
777 raise ValueError("proxy URL with no authority: %r" % proxy)
778 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
779 # and 3.3.), path is empty or starts with '/'
780 end = r_scheme.find("/", 2)
781 if end == -1:
782 end = None
783 authority = r_scheme[2:end]
Georg Brandl13e89462008-07-01 19:56:00 +0000784 userinfo, hostport = splituser(authority)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000785 if userinfo is not None:
Georg Brandl13e89462008-07-01 19:56:00 +0000786 user, password = splitpasswd(userinfo)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000787 else:
788 user = password = None
789 return scheme, user, password, hostport
790
class ProxyHandler(BaseHandler):
    """Redirect requests to the proxy configured for their URL scheme."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        """Install one '<scheme>_open' method per entry of *proxies*.

        *proxies* maps a URL scheme to a proxy URL; when omitted the
        mapping returned by getproxies() is used.
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, proxy_url in proxies.items():
            attr_name = '%s_open' % scheme
            # The loop values are bound as lambda defaults so that every
            # installed opener keeps its own proxy URL and scheme.
            opener = (lambda r, proxy=proxy_url, type=scheme,
                      meth=self.proxy_open: meth(r, proxy, type))
            setattr(self, attr_name, opener)

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*; re-dispatch it if the scheme changed.

        Returns None to let the remaining handlers process the request,
        or the response of re-opening it through the parent opener.
        """
        request_scheme = req.type
        proxy_scheme, user, password, hostport = _parse_proxy(proxy)
        if proxy_scheme is None:
            proxy_scheme = request_scheme

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user), unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        req.set_proxy(unquote(hostport), proxy_scheme)

        if request_scheme in (proxy_scheme, 'https'):
            # let other handlers take care of it
            return None

        # need to start over, because the other handlers don't
        # grok the proxy's URL type
        # e.g. if we have a constructor arg proxies like so:
        # {'http': 'ftp://proxy.example.com'}, we may end up turning
        # a request for http://acme.example.com/a into one for
        # ftp://proxy.example.com/a
        return self.parent.open(req, timeout=req.timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000832
class HTTPPasswordMgr:
    """Store (user, password) pairs keyed by realm and URI prefix."""

    def __init__(self):
        # realm -> {tuple of reduced URIs -> (user, passwd)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register *user*/*passwd* for *realm* at *uri*.

        *uri* may be a single URI string or a sequence of URIs.  Each entry
        is stored twice: once with the scheme's default port made explicit
        and once as given, so lookups match either spelling.
        """
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
                self.reduce_uri(u, default_port) for u in uri)
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) registered for *authuri* in *realm*.

        Returns (None, None) when no stored URI is equal to, or a parent
        of, *authuri*.
        """
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form, i.e. (authority, path)
        pairs as returned by reduce_uri().
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare on a path-segment boundary: a plain character prefix
        # would wrongly treat "/foobar" as lying below "/foo".
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
895
896
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the entries stored for realm None."""

    def find_user_password(self, realm, authuri):
        """Look up *authuri* in *realm* first, then in the None realm."""
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is None:
            # Nothing registered for this realm: try the catch-all realm.
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
905
906
class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm):
    """Password manager that also records an 'authenticated' flag per URI."""

    def __init__(self, *args, **kwargs):
        # reduced URI -> is_authenticated flag
        self.authenticated = {}
        super().__init__(*args, **kwargs)

    def add_password(self, realm, uri, user, passwd, is_authenticated=False):
        """Store credentials and record the *is_authenticated* flag for *uri*."""
        self.update_authenticated(uri, is_authenticated)
        # Add a default for prior auth requests
        if realm is not None:
            super().add_password(None, uri, user, passwd)
        super().add_password(realm, uri, user, passwd)

    def update_authenticated(self, uri, is_authenticated=False):
        """Set the flag for *uri* (one URI or a sequence of URIs)."""
        # uri could be a single URI or a sequence
        uris = [uri] if isinstance(uri, str) else uri

        for with_default_port in True, False:
            for item in uris:
                key = self.reduce_uri(item, with_default_port)
                self.authenticated[key] = is_authenticated

    def is_authenticated(self, authuri):
        """Return the flag of the first recorded URI that covers *authuri*.

        Returns None when no recorded URI matches.
        """
        for with_default_port in True, False:
            target = self.reduce_uri(authuri, with_default_port)
            for known_uri, flag in self.authenticated.items():
                if self.is_suburi(known_uri, target):
                    return flag
        return None
936
937
class AbstractBasicAuthHandler:
    """Shared logic for answering HTTP Basic authentication challenges.

    Subclasses provide 'auth_header', the name of the request header that
    carries the credentials.
    """

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\']?)([^"\']*)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        """Use *password_mgr* for lookups, or a fresh HTTPPasswordMgr."""
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Answer the challenge found in header *authreq*, if it is Basic.

        Returns the retried response, or None when there is no usable
        challenge.  Raises ValueError for a non-Basic scheme.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        challenge = headers.get(authreq, None)
        if not challenge:
            return None

        scheme = challenge.split()[0]
        if scheme.lower() != 'basic':
            raise ValueError("AbstractBasicAuthHandler does not"
                             " support the following scheme: '%s'" %
                             scheme)

        match = AbstractBasicAuthHandler.rx.search(challenge)
        if not match:
            return None
        scheme, quote, realm = match.groups()
        if quote not in ['"', "'"]:
            warnings.warn("Basic Auth Realm was unquoted",
                          UserWarning, 2)
        if scheme.lower() == 'basic':
            return self.retry_http_basic_auth(host, req, realm)
        return None

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send *req* with credentials for *realm*, if any are known.

        Returns None when no password is stored or when the very same
        credentials were already sent with this request.
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is None:
            return None
        raw = "%s:%s" % (user, pw)
        auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
        if req.get_header(self.auth_header, None) == auth:
            return None
        req.add_unredirected_header(self.auth_header, auth)
        return self.parent.open(req, timeout=req.timeout)

    def http_request(self, req):
        """Attach credentials up front for URLs flagged as authenticated.

        Only applies when the password manager offers is_authenticated().
        """
        supports_prior_auth = hasattr(self.passwd, 'is_authenticated')
        if (not supports_prior_auth or
                not self.passwd.is_authenticated(req.full_url)):
            return req

        if not req.has_header('Authorization'):
            user, passwd = self.passwd.find_user_password(None, req.full_url)
            credentials = '{0}:{1}'.format(user, passwd).encode()
            auth_str = base64.standard_b64encode(credentials).decode()
            req.add_unredirected_header('Authorization',
                                        'Basic {}'.format(auth_str.strip()))
        return req

    def http_response(self, req, response):
        """Record whether the request's URL answered with a 2xx status."""
        if hasattr(self.passwd, 'is_authenticated'):
            succeeded = 200 <= response.code < 300
            self.passwd.update_authenticated(req.full_url, succeeded)
        return response

    https_request = http_request
    https_response = http_response
1015
1016
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001017
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses with Basic credentials for the requested URL."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # The full URL is what the password manager is searched with.
        return self.http_error_auth_reqed('www-authenticate',
                                          req.full_url, req, headers)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001027
1028
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses with Basic credentials for the proxy."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component
        # in authority.  Assume there isn't one, since urllib.request does
        # not (and should not, RFC 3986 s. 3.2.1) support requests for URLs
        # containing userinfo.
        return self.http_error_auth_reqed('proxy-authenticate',
                                          req.host, req, headers)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001042
1043
# Return n random bytes.  Module-level alias of os.urandom; called as
# _randombytes(8) when the digest handler builds its client nonce.
_randombytes = os.urandom
1046
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001047
class AbstractDigestAuthHandler:
    """Shared logic for answering HTTP Digest authentication challenges.

    Subclasses provide 'auth_header', the name of the request header that
    carries the credentials.
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        """Use *passwd* as password manager, or a fresh HTTPPasswordMgr."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Answer the challenge in header *auth_header* if it is Digest.

        Raises HTTPError once more than five attempts were counted, and
        ValueError for a scheme that is neither Digest nor Basic.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        self.retried += 1
        if not authreq:
            return None
        scheme = authreq.split()[0]
        if scheme.lower() == 'digest':
            return self.retry_http_digest_auth(req, authreq)
        if scheme.lower() != 'basic':
            raise ValueError("AbstractDigestAuthHandler does not support"
                             " the following scheme: '%s'" % scheme)
        return None

    def retry_http_digest_auth(self, req, auth):
        """Re-send *req* with an answer to the challenge text *auth*.

        Returns None when no answer can be computed or when the same
        answer is already present in the request headers.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if not auth:
            return None
        auth_val = 'Digest %s' % auth
        if req.headers.get(self.auth_header, None) == auth_val:
            return None
        req.add_unredirected_header(self.auth_header, auth_val)
        return self.parent.open(req, timeout=req.timeout)

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        seed = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        material = seed.encode("ascii") + _randombytes(8)
        return hashlib.sha1(material).hexdigest()[:16]

    def get_authorization(self, req, chal):
        """Build the credentials text answering the parsed challenge *chal*.

        Returns None when the challenge lacks a realm or nonce or when no
        user is known for the realm.  Raises URLError for a qop value
        other than 'auth'.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        entdig = None
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop == 'auth':
            # The counter restarts whenever the server hands out a new nonce.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        fields = ['username="%s", realm="%s", nonce="%s", uri="%s", '
                  'response="%s"' % (user, realm, nonce, req.selector,
                                     respdig)]
        if opaque:
            fields.append('opaque="%s"' % opaque)
        if entdig:
            fields.append('digest="%s"' % entdig)
        fields.append('algorithm="%s"' % algorithm)
        if qop:
            fields.append('qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce))
        return ', '.join(fields)

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) functions for *algorithm* ('MD5' or 'SHA').

        H hashes an ASCII string to a hex digest; KD(s, d) is H("s:d").
        Raises ValueError for any other algorithm name.
        """
        # the functions assume digest modules are imported at the top level
        if algorithm == 'MD5':
            def H(x):
                return hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            def H(x):
                return hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        else:
            raise ValueError("Unsupported digest authentication "
                             "algorithm %r" % algorithm)

        def KD(s, d):
            return H("%s:%s" % (s, d))

        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
1190
1191
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        authority = urlparse(req.full_url).netloc
        response = self.http_error_auth_reqed('www-authenticate',
                                              authority, req, headers)
        # Only reached when the retry did not raise.
        self.reset_retry_count()
        return response
1208
1209
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 responses using Digest credentials."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              req.host, req, headers)
        # Only reached when the retry did not raise.
        self.reset_retry_count()
        return response
1221
class AbstractHTTPHandler(BaseHandler):
    """Common request preparation and sending for HTTP-like handlers."""

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def _get_content_length(self, request):
        # Delegates to http.client; a None result makes do_request_ fall
        # back to chunked transfer encoding.
        return http.client.HTTPConnection._get_content_length(
            request.data,
            request.get_method())

    def do_request_(self, request):
        """Fill in the headers a request needs before it is sent.

        Raises URLError when the request has no host and TypeError when
        its data is a str.  Returns the same request object.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            body = request.data
            if isinstance(body, str):
                msg = "POST data should be bytes, an iterable of bytes, " \
                      "or a file object. It cannot be of type str."
                raise TypeError(msg)
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            framing_given = (request.has_header('Content-length')
                             or request.has_header('Transfer-encoding'))
            if not framing_given:
                content_length = self._get_content_length(request)
                if content_length is None:
                    request.add_unredirected_header(
                        'Transfer-encoding', 'chunked')
                else:
                    request.add_unredirected_header(
                        'Content-length', str(content_length))

        # Through a proxy the selector is parsed as a full URL, so the
        # Host header is taken from it instead of from the proxy's host.
        sel_host = host
        if request.has_proxy():
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        Raises URLError when the request has no host or when sending the
        request fails with an OSError.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers win over regular ones with the same name.
        headers = dict(req.unredirected_hdrs)
        for key, value in req.headers.items():
            headers.setdefault(key, value)

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = {name.title(): val for name, val in headers.items()}

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                # Proxy-Authorization should not be sent to origin
                # server.
                tunnel_headers[proxy_auth_hdr] = headers.pop(proxy_auth_hdr)
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers,
                          encode_chunked=req.has_header('Transfer-encoding'))
            except OSError as err:  # timeout error
                raise URLError(err)
            r = h.getresponse()
        except BaseException:
            # Whatever went wrong, release the connection before the
            # exception continues to propagate.
            h.close()
            raise

        # If the server does not send us a 'Connection: close' header,
        # HTTPConnection assumes the socket should be left open. Manually
        # mark the socket to be closed when this response object goes away.
        if h.sock:
            h.sock.close()
            h.sock = None

        r.url = req.get_full_url()
        # Overwrite the .msg attribute of the HTTPResponse with the reason
        # phrase, because urllib clients expect the response to have the
        # reason in .msg.  It would be good to mark this attribute as
        # deprecated and get them to use info() or .headers instead.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001338
1339
class HTTPHandler(AbstractHTTPHandler):
    """Open http:// URLs with http.client.HTTPConnection."""

    # Requests are prepared by the shared implementation.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        connection_class = http.client.HTTPConnection
        return self.do_open(connection_class, req)
1346
# HTTPSHandler is only defined (and exported) when http.client provides
# HTTPSConnection.
if hasattr(http.client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Open https:// URLs with http.client.HTTPSConnection."""

        # Requests are prepared by the shared implementation.
        https_request = AbstractHTTPHandler.do_request_

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            # context and check_hostname are forwarded to the connection
            # constructor by do_open().
            return self.do_open(
                http.client.HTTPSConnection,
                req,
                context=self._context,
                check_hostname=self._check_hostname,
            )

    __all__.append('HTTPSHandler')
Senthil Kumaran0d54eb92011-11-01 23:49:46 +08001363
class HTTPCookieProcessor(BaseHandler):
    """Add cookies to outgoing requests and collect them from responses."""

    def __init__(self, cookiejar=None):
        """Use *cookiejar*, or a new http.cookiejar.CookieJar when omitted."""
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    # HTTPS traffic is processed the same way.
    https_request = http_request
    https_response = http_response
1381
class UnknownHandler(BaseHandler):
    """Reject every request handed to it as having an unknown URL type."""

    def unknown_open(self, req):
        message = 'unknown url type: %s' % req.type
        raise URLError(message)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001386
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dict mapping each key to its value with one pair of
    surrounding double quotes removed.  An empty value ("key=") is kept
    as the empty string.  Raises ValueError for an element without '='.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices instead of indexing so that an empty value does not
        # raise IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1396
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    current = []          # characters of the element being collected
    in_quotes = False
    escaped = False       # previous character was a backslash in quotes

    for ch in s:
        if escaped:
            # Character following a backslash is taken literally; the
            # backslash itself is not kept.
            current.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                current.append(ch)
        elif ch == ',':
            items.append(''.join(current))
            current = []
        else:
            if ch == '"':
                in_quotes = True
            current.append(ch)

    # append last part
    tail = ''.join(current)
    if tail:
        items.append(tail)

    return [item.strip() for item in items]
1439
class FileHandler(BaseHandler):
    """Open file: URLs that refer to the local machine."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        selector = req.selector
        names_other_host = (selector[:2] == '//' and selector[2:3] != '/'
                            and (req.host and req.host != 'localhost'))
        if not names_other_host:
            return self.open_local_file(req)
        if req.host not in self.get_names():
            raise URLError("file:// scheme is supported only on localhost")
        # A host listed in get_names() falls through without a result.
        return None

    # names for the localhost
    names = None

    def get_names(self):
        """Return the addresses considered local, resolving them once.

        The result is cached on the FileHandler class itself.
        """
        if FileHandler.names is None:
            try:
                loopback = socket.gethostbyname_ex('localhost')[2]
                own = socket.gethostbyname_ex(socket.gethostname())[2]
                FileHandler.names = tuple(loopback + own)
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the local file named by *req*.

        Raises URLError when the file cannot be accessed or when the
        request's host is not the local machine.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            # 'port' is only evaluated when 'host' is non-empty, i.e.
            # after the split above has bound it.
            is_local = not host or (
                not port and _safe_gethostbyname(host) in self.get_names())
            if is_local:
                prefix = 'file://' + host if host else 'file://'
                origurl = prefix + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as exp:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(exp)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001491
1492def _safe_gethostbyname(host):
1493 try:
1494 return socket.gethostbyname(host)
1495 except socket.gaierror:
1496 return None
1497
class FTPHandler(BaseHandler):
    """Handler that opens ``ftp:`` URLs over a fresh FTP connection."""

    def ftp_open(self, req):
        """Retrieve the file or directory listing named by *req*.

        Returns an addinfourl object whose headers carry Content-type
        and/or Content-length when they can be determined.  Raises URLError
        when no host is given, when the host cannot be resolved, or when
        any of ftplib.all_errors occurs during the transfer.
        """
        import ftplib
        import mimetypes
        netloc = req.host
        if not netloc:
            raise URLError('ftp error: no host given')
        netloc, port = splitport(netloc)
        port = ftplib.FTP_PORT if port is None else int(port)

        # username/password handling
        user, host = splituser(netloc)
        passwd = None
        if user:
            user, passwd = splitpasswd(user)
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except OSError as msg:
            raise URLError(msg)

        path, attrs = splitattr(req.selector)
        segments = [unquote(segment) for segment in path.split('/')]
        # The last segment is the file name (empty for a directory URL);
        # everything before it is the directory chain.
        filename = segments.pop()
        dirs = segments
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            wrapper = self.connect_ftp(user, passwd, host, port, dirs,
                                       req.timeout)
            # Binary transfer for a file, directory listing otherwise,
            # unless a ";type=" attribute says differently.
            transfer_type = 'I' if filename else 'D'
            for attr in attrs:
                attr_name, attr_value = splitvalue(attr)
                if (attr_name.lower() == 'type'
                        and attr_value in ('a', 'A', 'i', 'I', 'd', 'D')):
                    transfer_type = attr_value.upper()
            fp, retrlen = wrapper.retrfile(filename, transfer_type)
            header_lines = []
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                header_lines.append("Content-type: %s\n" % mtype)
            if retrlen is not None and retrlen >= 0:
                header_lines.append("Content-length: %d\n" % retrlen)
            headers = email.message_from_string("".join(header_lines))
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as exp:
            wrapped = URLError('ftp error: %r' % exp)
            raise wrapped.with_traceback(exp.__traceback__)

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a new, non-persistent ftpwrapper for the given endpoint."""
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001555
class CacheFTPHandler(FTPHandler):
    """FTPHandler variant that keeps ftpwrapper objects in a cache.

    Connections are keyed by (user, host, port, directory path, timeout).
    Each entry has an expiry time of "last use + delay"; entries are
    dropped when they expire or when the cache reaches ``max_conns``.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}      # key -> ftpwrapper
        self.timeout = {}    # key -> expiry time (time.time() based)
        self.soonest = 0     # earliest expiry time currently in the cache
        self.delay = 60      # seconds an entry stays valid after use
        self.max_conns = 16

    def setTimeout(self, t):
        """Set the number of seconds a cached connection stays valid."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which the oldest entry gets evicted."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached ftpwrapper for the endpoint, creating it if needed.

        The entry's expiry time is refreshed on every call.
        """
        key = user, host, port, '/'.join(dirs), timeout
        conn = self.cache.get(key)
        if conn is None:
            conn = ftpwrapper(user, passwd, host, port, dirs, timeout)
            self.cache[key] = conn
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        # Return the local reference rather than looking the key up again:
        # check_cache() may just have evicted this very entry (for
        # instance when max_conns is 1), which used to raise KeyError.
        return conn

    def check_cache(self):
        """Drop expired entries, then enforce the size limit."""
        # first check for old ones
        now = time.time()
        if self.soonest <= now:
            for key, expiry in list(self.timeout.items()):
                if expiry < now:
                    self.cache[key].close()
                    del self.cache[key]
                    del self.timeout[key]
        # default=0 matches the initial value of `soonest` and avoids a
        # ValueError from min() when the cache has become empty.
        self.soonest = min(self.timeout.values(), default=0)

        # then check the size; ">=" also copes with a limit that was
        # lowered below the current size (one entry is evicted per call)
        if len(self.cache) >= self.max_conns:
            for key, expiry in list(self.timeout.items()):
                if expiry == self.soonest:
                    del self.cache[key]
                    del self.timeout[key]
                    break
            self.soonest = min(self.timeout.values(), default=0)

    def clear_cache(self):
        """Close every cached connection and empty the cache."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
1608
class DataHandler(BaseHandler):
    """Handler that decodes ``data:`` URLs in-process."""

    def data_open(self, req):
        """Return an addinfourl object holding the payload of a data URL.

        The response body is a BytesIO of the decoded data and the headers
        carry Content-type and Content-length.
        """
        # data URLs as specified in RFC 2397.
        #
        # ignores POSTed data
        #
        # syntax:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        url = req.full_url

        _scheme, remainder = url.split(":", 1)
        mediatype, payload = remainder.split(",", 1)

        # even base64 encoded data URLs might be quoted so unquote in any case:
        payload = unquote_to_bytes(payload)
        base64_suffix = ";base64"
        if mediatype.endswith(base64_suffix):
            payload = base64.decodebytes(payload)
            mediatype = mediatype[:-len(base64_suffix)]

        mediatype = mediatype or "text/plain;charset=US-ASCII"

        header_text = ("Content-type: %s\nContent-length: %d\n" %
                       (mediatype, len(payload)))
        headers = email.message_from_string(header_text)

        return addinfourl(io.BytesIO(payload), headers, url)
1638
Nadeem Vawda08f5f7a2011-07-23 14:03:00 +02001639
# Code moved from the old urllib module
1641
MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# On Windows the conversions live in nturl2path; everywhere else a path
# and the path part of a 'file' URL differ only by percent-quoting.
if os.name != 'nt':
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        path = unquote(pathname)
        return path

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        url = quote(pathname)
        return url
else:
    from nturl2path import url2pathname, pathname2url
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001657
1658# This really consists of two pieces:
1659# (1) a class which handles opening of all sorts of URLs
1660# (plus assorted utilities etc.)
1661# (2) a set of functions for parsing URLs
1662# XXX Should these be separated out into different modules?
1663
1664
ftpcache = {}
class URLopener:
    """Class to open URLs.

    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    __tempfiles = None

    # Class-level default so that cleanup() -- which also runs from
    # __del__ -- does not fail with AttributeError on an instance whose
    # __init__ did not run to completion.
    tempcache = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        """Set up the opener (emits a DeprecationWarning).

        *proxies* is a mapping from URL scheme to proxy URL; when None,
        getproxies() is used.  The optional keyword arguments ``key_file``
        and ``cert_file`` are passed on to HTTPS connections.
        """
        msg = "%(class)s style of invoking requests is deprecated. " \
              "Use newer urlopen functions/methods" % {'class': self.__class__.__name__}
        warnings.warn(msg, DeprecationWarning, stacklevel=3)
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        """Release resources held by the opener (see cleanup())."""
        self.cleanup()

    def cleanup(self):
        """Delete the temporary files made by retrieve(); clear tempcache."""
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r').

        Dispatches to the method ``open_<scheme>`` (with '-' replaced by
        '_'); a URL without a scheme is treated as a 'file' URL.  When a
        proxy is configured for the scheme, the proxy's scheme picks the
        method and the method receives a (proxyhost, fullurl) tuple instead
        of a string.  HTTPError and URLError propagate unchanged; any other
        OSError is re-raised as OSError('socket error', original).
        """
        fullurl = unwrap(to_bytes(fullurl))
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except (HTTPError, URLError):
            raise
        except OSError as msg:
            raise OSError('socket error', msg).with_traceback(sys.exc_info()[2])

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise OSError('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open a URL through an unknown proxy type."""
        type, url = splittype(fullurl)
        raise OSError('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object.

        When given, *reporthook* is called as
        reporthook(blocknum, blocksize, totalsize) once before the first
        block and once after every block; totalsize is -1 when the
        response has no Content-Length header.  Raises
        ContentTooShortError when fewer bytes arrive than announced.
        """
        url = unwrap(to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return url2pathname(splithost(url1)[1]), hdrs
            except OSError:
                # Not usable as a local file; fall back to a normal fetch.
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                import tempfile
                garbage, path = splittype(url)
                garbage, path = splithost(path or "")
                path, garbage = splitquery(path or "")
                path, garbage = splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                # Remembered so that cleanup() can delete it later.
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    reporthook(blocknum, bs, size)
                while True:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result

    # Each method named open_<type> knows how to open that type of URL

    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_factory.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieve or a host, relative-path pair.
        - data is payload for a POST request or None.

        A 2xx response is returned as an addinfourl object; any other
        status is handed to http_error().
        """

        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

        if not host:
            raise OSError('http error', 'no host given')

        if proxy_passwd:
            proxy_passwd = unquote(proxy_passwd)
            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
        else:
            proxy_auth = None

        if user_passwd:
            user_passwd = unquote(user_passwd)
            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
        else:
            auth = None
        http_conn = connection_factory(host)
        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] = "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost

        # Add Connection:close as we don't support persistent connections yet.
        # This helps in closing the socket and avoiding ResourceWarning

        headers["Connection"] = "close"

        # Headers registered on the opener override the ones set above.
        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line; nothing will
            # read from this connection any more, so do not leave it open
            http_conn.close()
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        return self._open_generic_http(http.client.HTTPConnection, url, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            # A falsy result means the specific handler declined.
            if result:
                return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise HTTPError."""
        fp.close()
        raise HTTPError(url, errcode, errmsg, headers, None)

    if _have_ssl:
        def _https_connection(self, host):
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise URLError('file error: proxy support for file protocol currently not implemented')
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            raise ValueError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file.

        Raises URLError when the file cannot be stat()ed or when the host
        part of *url* does not refer to this machine.
        """
        import email.utils
        import mimetypes
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = splitport(host)
        if (not port
           and socket.gethostbyname(host) in ((localhost(),) + thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            elif file[:2] == './':
                raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error: not on local host')

    def open_ftp(self, url):
        """Use FTP protocol.

        Connections are kept in self.ftpcache, keyed by
        (user, host, port, directory path).
        """
        if not isinstance(url, str):
            raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
        import mimetypes
        host, path = splithost(url)
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        if dirs and not dirs[0]:
            dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in list(self.ftpcache):
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if key not in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            if not file:
                type = 'D'
            else:
                type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors() as exp:
            raise URLError('ftp error %r' % exp).with_traceback(sys.exc_info()[2])

    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise URLError('data error: proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise OSError('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        # A trailing ";token" without "=" is the encoding, not a parameter.
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            # XXX is this encoding/decoding ok?
            data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        headers = email.message_from_string(msg)
        # NOTE(review): the stream handed back holds the complete message
        # text, i.e. the header lines followed by the data -- confirm that
        # callers expect the headers to be part of what read() returns.
        f = io.StringIO(msg)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002110
2111
2112class FancyURLopener(URLopener):
2113 """Derived class with handlers for errors we can handle (perhaps)."""
2114
def __init__(self, *args, **kwargs):
    """Initialise the base opener, then the authentication/redirect state."""
    URLopener.__init__(self, *args, **kwargs)
    # Redirect bookkeeping: current redirect depth and its upper bound.
    self.maxtries = 10
    self.tries = 0
    # Starts out empty.
    self.auth_cache = {}
2120
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Default error handling -- don't raise an exception.

    The error response is handed back as an ordinary addinfourl object
    that carries *errcode* as its status code.
    """
    full_url = "http:" + url
    response = addinfourl(fp, headers, full_url, errcode)
    return response
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002124
def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 302 -- relocated (temporarily).

    Follows the redirect via redirect_internal().  Once the number of
    nested redirects reaches ``maxtries`` the request is instead reported
    as a 500 error through http_error_500 (when defined) or
    http_error_default.
    """
    self.tries += 1
    try:
        limit_reached = self.maxtries and self.tries >= self.maxtries
        if not limit_reached:
            return self.redirect_internal(url, fp, errcode, errmsg,
                                          headers, data)
        handler = (self.http_error_500 if hasattr(self, "http_error_500")
                   else self.http_error_default)
        return handler(url, fp, 500,
                       "Internal Server Error: Redirect Recursion",
                       headers)
    finally:
        # The counter is reset whenever a call returns or raises.
        self.tries = 0
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002142
def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
    """Open the redirect target named by the response headers.

    The target is taken from the 'location' header or, failing that, the
    'uri' header; None is returned when neither is present.  Raises
    HTTPError when the target uses a scheme other than http, https or ftp.
    """
    for field in ('location', 'uri'):
        if field in headers:
            newurl = headers[field]
            break
    else:
        return
    fp.close()

    # In case the server sent a relative URL, join with original:
    newurl = urljoin(self.type + ":" + url, newurl)

    # For security reasons, we don't allow redirection to anything other
    # than http, https and ftp.

    # We are using newer HTTPError with older redirect_internal method
    # This older method will get deprecated in 3.3

    allowed_schemes = ('http', 'https', 'ftp', '')
    if urlparse(newurl).scheme not in allowed_schemes:
        raise HTTPError(newurl, errcode,
                        errmsg +
                        " Redirection to url '%s' is not allowed." % newurl,
                        headers, fp)

    return self.open(newurl)
2170
def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 301 -- also relocated (permanently)."""
    # Handled exactly like a temporary redirect.
    handle_redirect = self.http_error_302
    return handle_redirect(url, fp, errcode, errmsg, headers, data)
2174
def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 303 -- also relocated (essentially identical to 302)."""
    # Delegates to the 302 handler with the arguments unchanged.
    handle_redirect = self.http_error_302
    return handle_redirect(url, fp, errcode, errmsg, headers, data)
2178
def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 307 -- relocated, but turn POST into error."""
    if data is not None:
        # A request that carried data is not redirected.
        return self.http_error_default(url, fp, errcode, errmsg, headers)
    return self.http_error_302(url, fp, errcode, errmsg, headers, data)
2185
def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
        retry=False):
    """Error 401 -- authentication required.
    This function supports Basic authentication only.

    Unless the response carries a parseable Basic challenge and *retry*
    is true, the error goes to URLopener.http_error_default.  Otherwise
    the request is retried through ``retry_<type>_basic_auth``.
    """
    def report_error():
        # Hand the response to the base-class default handler.
        URLopener.http_error_default(self, url, fp,
                                     errcode, errmsg, headers)

    if 'www-authenticate' not in headers:
        report_error()
    challenge = headers['www-authenticate']
    match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', challenge)
    if not match:
        report_error()
    scheme, realm = match.groups()
    if scheme.lower() != 'basic':
        report_error()
    if not retry:
        report_error()
    retry_method = getattr(self, 'retry_' + self.type + '_basic_auth')
    if data is None:
        return retry_method(url, realm)
    return retry_method(url, realm, data)
2210
def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
                   retry=False):
    """Error 407 -- proxy authentication required.

    Only the Basic scheme is supported.  The request is retried through
    the 'retry_proxy_<type>_basic_auth' method (looked up from
    self.type) only when *retry* is true and the 'proxy-authenticate'
    header carries a Basic challenge with a quoted realm; in every other
    case the error is passed to URLopener.http_error_default.
    """
    def give_up():
        # NOTE(review): the code after each call relies on
        # http_error_default not returning normally (presumably it
        # raises; it is defined elsewhere in this file).
        URLopener.http_error_default(self, url, fp,
                                     errcode, errmsg, headers)

    if 'proxy-authenticate' not in headers:
        give_up()
    challenge = headers['proxy-authenticate']
    parsed = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', challenge)
    if not parsed:
        give_up()
    scheme, realm = parsed.groups()
    if scheme.lower() != 'basic':
        give_up()
    if not retry:
        give_up()
    retry_method = getattr(self, 'retry_proxy_' + self.type + '_basic_auth')
    if data is None:
        return retry_method(url, realm)
    return retry_method(url, realm, data)
2235
def retry_proxy_http_basic_auth(self, url, realm, data=None):
    """Retry an http request after adding Basic credentials to the proxy.

    The 'http' entry of self.proxies is rewritten in place to embed the
    percent-quoted user name and password obtained from
    get_user_passwd.  Returns None without reopening when no
    credentials are available.
    """
    host, selector = splithost(url)
    target = 'http://' + host + selector
    _scheme, remainder = splittype(self.proxies['http'])
    netloc, proxy_path = splithost(remainder)
    # Strip any 'user:pass@' prefix.  A non-zero offset means credentials
    # were already embedded; it is passed on as the clear_cache flag.
    offset = netloc.find('@') + 1
    netloc = netloc[offset:]
    user, passwd = self.get_user_passwd(netloc, realm, offset)
    if not (user or passwd):
        return None
    netloc = "%s:%s@%s" % (quote(user, safe=''),
                           quote(passwd, safe=''), netloc)
    self.proxies['http'] = 'http://' + netloc + proxy_path
    if data is None:
        return self.open(target)
    return self.open(target, data)
2253
def retry_proxy_https_basic_auth(self, url, realm, data=None):
    """Retry an https request after adding Basic credentials to the proxy.

    The 'https' entry of self.proxies is rewritten in place to embed
    the percent-quoted user name and password obtained from
    get_user_passwd.  Returns None without reopening when no
    credentials are available.
    """
    host, selector = splithost(url)
    target = 'https://' + host + selector
    _scheme, remainder = splittype(self.proxies['https'])
    netloc, proxy_path = splithost(remainder)
    # Strip any 'user:pass@' prefix.  A non-zero offset means credentials
    # were already embedded; it is passed on as the clear_cache flag.
    offset = netloc.find('@') + 1
    netloc = netloc[offset:]
    user, passwd = self.get_user_passwd(netloc, realm, offset)
    if not (user or passwd):
        return None
    netloc = "%s:%s@%s" % (quote(user, safe=''),
                           quote(passwd, safe=''), netloc)
    self.proxies['https'] = 'https://' + netloc + proxy_path
    if data is None:
        return self.open(target)
    return self.open(target, data)
2271
def retry_http_basic_auth(self, url, realm, data=None):
    """Retry an http request with Basic credentials embedded in the URL.

    Credentials come from get_user_passwd and are percent-quoted before
    being placed in front of the host.  Returns None without reopening
    when no credentials are available.
    """
    netloc, selector = splithost(url)
    # Strip any 'user:pass@' prefix.  A non-zero offset means credentials
    # were already embedded; it is passed on as the clear_cache flag.
    offset = netloc.find('@') + 1
    netloc = netloc[offset:]
    user, passwd = self.get_user_passwd(netloc, realm, offset)
    if not (user or passwd):
        return None
    netloc = "%s:%s@%s" % (quote(user, safe=''),
                           quote(passwd, safe=''), netloc)
    target = 'http://' + netloc + selector
    if data is None:
        return self.open(target)
    return self.open(target, data)
2285
def retry_https_basic_auth(self, url, realm, data=None):
    """Retry an https request with Basic credentials embedded in the URL.

    Credentials come from get_user_passwd and are percent-quoted before
    being placed in front of the host.  Returns None without reopening
    when no credentials are available.
    """
    netloc, selector = splithost(url)
    # Strip any 'user:pass@' prefix.  A non-zero offset means credentials
    # were already embedded; it is passed on as the clear_cache flag.
    offset = netloc.find('@') + 1
    netloc = netloc[offset:]
    user, passwd = self.get_user_passwd(netloc, realm, offset)
    if not (user or passwd):
        return None
    netloc = "%s:%s@%s" % (quote(user, safe=''),
                           quote(passwd, safe=''), netloc)
    target = 'https://' + netloc + selector
    if data is None:
        return self.open(target)
    return self.open(target, data)
2299
def get_user_passwd(self, host, realm, clear_cache=0):
    """Return (user, passwd) for *realm* at *host*, consulting the cache.

    Entries in self.auth_cache are keyed by 'realm@host' with the host
    lower-cased.  A true *clear_cache* discards any cached entry and
    prompts again.  A prompt result is cached only when at least one of
    user and passwd is truthy.
    """
    key = realm + '@' + host.lower()
    cache = self.auth_cache
    if key in cache:
        if not clear_cache:
            return cache[key]
        del cache[key]
    user, passwd = self.prompt_user_passwd(host, realm)
    if user or passwd:
        cache[key] = (user, passwd)
    return user, passwd
2310
def prompt_user_passwd(self, host, realm):
    """Ask for a user name and password on the terminal.

    Override this in a GUI environment!  Returns (user, passwd), or
    (None, None) when the prompt is interrupted with Ctrl-C.
    """
    from getpass import getpass
    try:
        user = input("Enter username for %s at %s: " % (realm, host))
        passwd = getpass("Enter password for %s in %s at %s: " %
                         (user, realm, host))
    except KeyboardInterrupt:
        # Finish the interrupted prompt line before reporting "no answer".
        print()
        return None, None
    return user, passwd
2322
2323
2324# Utility functions
2325
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup is done once and the result is kept in the module-level
    _localhost for later calls.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2333
_thishost = None
def thishost():
    """Return the IP addresses of the current host as a tuple.

    The addresses are looked up for socket.gethostname(); if that
    lookup raises socket.gaierror, the addresses of 'localhost' are
    used instead.  The result is kept in the module-level _thishost.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    try:
        addresses = socket.gethostbyname_ex(socket.gethostname())[2]
    except socket.gaierror:
        addresses = socket.gethostbyname_ex('localhost')[2]
    _thishost = tuple(addresses)
    return _thishost
2344
2345_ftperrors = None
2346def ftperrors():
2347 """Return the set of errors raised by the FTP class."""
2348 global _ftperrors
2349 if _ftperrors is None:
2350 import ftplib
2351 _ftperrors = ftplib.all_errors
2352 return _ftperrors
2353
_noheaders = None
def noheaders():
    """Return an empty email Message object.

    The object is built once from an empty string and the same instance
    is returned on every call (kept in the module-level _noheaders).
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2361
2362
2363# Utility classes
2364
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        """Store the connection parameters and connect immediately.

        *dirs* is the sequence of directory names joined with '/' and
        changed into after login.  When *persistent* is false the
        connection is really closed as soon as the last file object
        handed out by retrfile() is closed.  If connecting fails, the
        wrapper is closed and the exception propagates.
        """
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.refcount = 0
        self.keepalive = persistent
        try:
            self.init()
        except BaseException:
            # Release the half-open connection, then let the error out.
            self.close()
            raise

    def init(self):
        """(Re)connect, log in and change to the target directory."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        _target = '/'.join(self.dirs)
        self.ftp.cwd(_target)

    def retrfile(self, file, type):
        """Start retrieving *file* and return (fileobj, retrlen).

        *type* is the FTP transfer type; 'd' or 'D' requests a directory
        listing.  A failed RETR whose reply starts with '550' falls back
        to a directory listing; any other permanent error is raised as
        URLError chained to the original ftplib error.  Closing the
        returned file object invokes file_close().  *retrlen* is the
        second value returned by ftplib's ntransfercmd().
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'):
            cmd = 'TYPE A'
            isdir = 1
        else:
            cmd = 'TYPE ' + type
            isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The command failed on the current connection; reconnect
            # once and retry.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                if str(reason)[:3] != '550':
                    # Chain explicitly, like the directory check below.
                    raise URLError('ftp error: %r' % reason) from reason
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error: %r' % reason) from reason
                finally:
                    # Always return to the directory we started in.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        # The file object created by makefile() is what callers read
        # from; the socket object itself is closed here.
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        """Mark the connection as no longer busy with a transfer."""
        self.busy = 0

    def close(self):
        """Stop keeping the connection alive; close it once unused."""
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        """Close hook for file objects returned by retrfile()."""
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        """Close the underlying FTP connection, ignoring FTP errors."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2458
2459# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    suffix = '_proxy'
    cut = -len(suffix)
    environ = os.environ
    proxies = {}
    # Lowercase variables take precedence, so the environment is walked
    # twice: this first pass accepts any capitalisation.
    for raw_name, value in environ.items():
        lowered = raw_name.lower()
        if value and lowered.endswith(suffix):
            proxies[lowered[:cut]] = value
    # CVE-2016-1000110 - when running as a CGI script, drop the 'http'
    # entry collected so far: HTTP_PROXY (not all-lowercase) may have been
    # set by the web server from a client's "Proxy:" header.  A lowercase
    # http_proxy is put back by the second pass below.
    if 'REQUEST_METHOD' in environ:
        proxies.pop('http', None)
    # Second pass: only names ending in lowercase '_proxy'.  A non-empty
    # value overrides the first pass; an empty value removes the entry.
    for raw_name, value in environ.items():
        if not raw_name.endswith(suffix):
            continue
        scheme = raw_name.lower()[:cut]
        if value:
            proxies[scheme] = value
        else:
            proxies.pop(scheme, None)
    return proxies
2490
def proxy_bypass_environment(host, proxies=None):
    """Test if proxies should not be used for a particular host.

    Checks the proxy dict for the value of no_proxy, which should
    be a list of comma separated DNS suffixes, or '*' for all hosts.
    Matching is case-insensitive; a leading dot on a suffix is ignored,
    so '.example.com' and 'example.com' behave the same.  Each suffix is
    compared against the host both with and without its port.

    *proxies* defaults to the mapping from getproxies_environment().
    Returns 1 when the host should bypass the proxy, otherwise 0.

    """
    if proxies is None:
        proxies = getproxies_environment()
    # don't bypass, if no_proxy isn't specified
    try:
        no_proxy = proxies['no']
    except KeyError:
        return 0
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    host = host.lower()
    # strip port off host
    hostonly, _port = splitport(host)
    # check if the host ends with any of the DNS suffixes
    for name in no_proxy.split(','):
        # '.example.com' is a common spelling of 'example.com'; ignore
        # the leading dots so both forms match.
        name = name.strip().lstrip('.').lower()
        if not name:
            continue
        if hostonly == name or host == name:
            return 1
        # Compare on a label boundary so that 'example.com' does not
        # match 'notexample.com'.
        suffix = '.' + name
        if hostonly.endswith(suffix) or host.endswith(suffix):
            return 1
    # otherwise, don't bypass
    return 0
2521
2522
Ronald Oussorene72e1612011-03-14 18:15:25 -04002523# This code tests an OSX specific data structure but is testable on all
2524# platforms
2525def _proxy_bypass_macosx_sysconf(host, proxy_settings):
2526 """
2527 Return True iff this host shouldn't be accessed using a proxy
2528
2529 This function uses the MacOSX framework SystemConfiguration
2530 to fetch the proxy information.
2531
2532 proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
2533 { 'exclude_simple': bool,
2534 'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
2535 }
2536 """
Ronald Oussorene72e1612011-03-14 18:15:25 -04002537 from fnmatch import fnmatch
2538
2539 hostonly, port = splitport(host)
2540
2541 def ip2num(ipAddr):
2542 parts = ipAddr.split('.')
2543 parts = list(map(int, parts))
2544 if len(parts) != 4:
2545 parts = (parts + [0, 0, 0, 0])[:4]
2546 return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
2547
2548 # Check for simple host names:
2549 if '.' not in host:
2550 if proxy_settings['exclude_simple']:
2551 return True
2552
2553 hostIP = None
2554
2555 for value in proxy_settings.get('exceptions', ()):
2556 # Items in the list are strings like these: *.local, 169.254/16
2557 if not value: continue
2558
2559 m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
2560 if m is not None:
2561 if hostIP is None:
2562 try:
2563 hostIP = socket.gethostbyname(hostonly)
2564 hostIP = ip2num(hostIP)
Andrew Svetlov0832af62012-12-18 23:10:48 +02002565 except OSError:
Ronald Oussorene72e1612011-03-14 18:15:25 -04002566 continue
2567
2568 base = ip2num(m.group(1))
2569 mask = m.group(2)
2570 if mask is None:
2571 mask = 8 * (m.group(1).count('.') + 1)
2572 else:
2573 mask = int(mask[1:])
2574 mask = 32 - mask
2575
2576 if (hostIP >> mask) == (base >> mask):
2577 return True
2578
2579 elif fnmatch(host, value):
2580 return True
2581
2582 return False
2583
2584
# Bind the public getproxies() / proxy_bypass() names according to the
# platform: macOS (SystemConfiguration), Windows (registry), or plain
# environment variables everywhere else.
if sys.platform == 'darwin':
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        """Return True if host should bypass the proxy per the system settings.

        Fetches the current settings with _scproxy._get_proxy_settings()
        and evaluates them with _proxy_bypass_macosx_sysconf().
        """
        proxy_settings = _get_proxy_settings()
        return _proxy_bypass_macosx_sysconf(host, proxy_settings)

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()



    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or from the MacOSX framework SystemConfiguration.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if any are
        specified, otherwise from the MacOSX framework SystemConfiguration.
        """
        return getproxies_environment() or getproxies_macosx_sysconf()


elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            # NOTE(review): only reached when nothing above raised; on an
            # exception the key is not closed explicitly here.
            internetSettings.Close()
        except (OSError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # Nothing more to do: whatever was collected so far is returned.
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        """Return 1 if host matches the registry's ProxyOverride list, else 0.

        Returns 0 when winreg cannot be imported, the registry values
        cannot be read, the proxy is disabled, or the override list is
        empty.  The host name, its resolved address and its fully
        qualified name are each tested against every override entry.
        """
        try:
            import winreg
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except OSError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except OSError:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except OSError:
            pass
        # The registry entry is a ';'-separated list of override patterns.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            # '<local>' bypasses the proxy for host names without a dot.
            if test == '<local>':
                if '.' not in rawHost:
                    return 1
            # Translate the glob-style entry into a regular expression.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # NOTE(review): re.match anchors only at the start, so an
                # entry also matches any candidate that merely begins
                # with it.
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or the registry.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment