blob: 67e73f9ef855a4abf5345cf00833fe26adfafb5b [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""An extensible library for opening URLs using a variety of protocols
2
The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below). It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
urlopen(url, data=None) -- Basic usage is the same as the original
urllib: pass the URL and optionally data to post to an HTTP URL, and
get a file-like object back. One difference is that you can also pass
a Request instance instead of a URL. Raises a URLError (subclass of
OSError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.
23
build_opener -- Function that creates a new OpenerDirector instance.
It installs the default handlers. Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate. If one of the arguments is a subclass of a default
handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
Senthil Kumaran1107c5d2009-11-15 06:20:55 +000033
Senthil Kumaran47fff872009-12-20 07:10:31 +000034OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000036
37Request -- An object that encapsulates the state of a request. The
38state can be as simple as the URL. It can also include extra HTTP
39headers, e.g. a User-Agent.
40
41BaseHandler --
42
43internals:
44BaseHandler and parent
45_call_chain conventions
46
47Example usage:
48
Georg Brandl029986a2008-06-23 11:44:14 +000049import urllib.request
Jeremy Hylton1afc1692008-06-18 20:49:58 +000050
51# set up authentication info
Georg Brandl029986a2008-06-23 11:44:14 +000052authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000053authinfo.add_password(realm='PDQ Application',
54 uri='https://mahler:8092/site-updates.py',
55 user='klem',
56 passwd='geheim$parole')
57
Georg Brandl029986a2008-06-23 11:44:14 +000058proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton1afc1692008-06-18 20:49:58 +000059
60# build a new opener that adds authentication and caching FTP handlers
Georg Brandl029986a2008-06-23 11:44:14 +000061opener = urllib.request.build_opener(proxy_support, authinfo,
62 urllib.request.CacheFTPHandler)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000063
64# install it
Georg Brandl029986a2008-06-23 11:44:14 +000065urllib.request.install_opener(opener)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000066
Georg Brandl029986a2008-06-23 11:44:14 +000067f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton1afc1692008-06-18 20:49:58 +000068"""
69
# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails for some reason, how should the error be signalled? The
# client needs to know the HTTP error code. But if the handler knows
# that the problem was, e.g., that it didn't know the hash algo that
# was requested in the challenge, it would be good to pass that
# information along to the client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation
79
80# Possible extensions:
81# complex proxies XXX not sure what exactly was meant by this
82# abstract factory for opener
83
84import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000085import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000086import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
Jeremy Hylton1afc1692008-06-18 20:49:58 +000092import re
93import socket
Martin Pantere6f06092016-05-16 01:14:20 +000094import string
Jeremy Hylton1afc1692008-06-18 20:49:58 +000095import sys
96import time
Senthil Kumaran7bc0d872010-12-19 10:49:52 +000097import collections
Senthil Kumarane24f96a2012-03-13 19:29:33 -070098import tempfile
99import contextlib
Senthil Kumaran38b968b92012-03-14 13:43:53 -0700100import warnings
Senthil Kumarane24f96a2012-03-13 19:29:33 -0700101
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000102
Georg Brandl13e89462008-07-01 19:56:00 +0000103from urllib.error import URLError, HTTPError, ContentTooShortError
104from urllib.parse import (
105 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
106 splittype, splithost, splitport, splituser, splitpasswd,
Antoine Pitroudf204be2012-11-24 17:59:08 +0100107 splitattr, splitquery, splitvalue, splittag, to_bytes,
108 unquote_to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000109from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000110
# check for SSL
# _have_ssl records whether the ssl module could be imported; urlopen()
# checks it before building an SSL context from cafile/capath.
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True

# Names exported by "from urllib.request import *".
__all__ = [
    # Classes
    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
    'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler',
    'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler',
    'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler',
    'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler',
    'UnknownHandler', 'HTTPErrorProcessor',
    # Functions
    'urlopen', 'install_opener', 'build_opener',
    'pathname2url', 'url2pathname', 'getproxies',
    # Legacy interface
    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
]

# used in User-Agent header sent
# (formatted as "<major>.<minor>" of the running interpreter)
__version__ = '%d.%d' % sys.version_info[:2]

# Module-wide default opener used by urlopen(): created lazily on first use,
# replaced by install_opener() and reset to None by urlcleanup().
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None, cadefault=False, context=None):
    """Open *url* (a URL string or a Request object) and return the response.

    *data* must be a bytes object holding additional data to send to the
    server, or None when there is nothing to send.  It may also be an
    iterable, in which case a Content-Length value must be supplied in the
    headers.  Only HTTP requests currently use *data*; supplying it turns
    the request into a POST instead of a GET.

    *data* should be a buffer in the standard
    application/x-www-form-urlencoded format.  urllib.parse.urlencode()
    takes a mapping or a sequence of 2-tuples and returns an ASCII text
    string in that format; encode it to bytes before passing it here.

    The urllib.request module uses HTTP/1.1 and includes a
    "Connection:close" header in its HTTP requests.

    *timeout* is an optional timeout in seconds for blocking operations
    such as the connection attempt (the global default timeout is used
    when it is omitted).  It only works for HTTP, HTTPS and FTP
    connections.

    *context*, if given, must be an ssl.SSLContext instance describing the
    various SSL options.  See HTTPSConnection for more details.

    *cafile* and *capath* optionally specify a set of trusted CA
    certificates for HTTPS requests: *cafile* points to a single file
    containing a bundle of CA certificates, *capath* to a directory of
    hashed certificate files.  See ssl.SSLContext.load_verify_locations()
    for more information.  They cannot be combined with *context*
    (ValueError is raised), and ValueError is also raised when they are
    used but SSL support is not available.

    The *cadefault* parameter is ignored.

    The returned object always works as a context manager and has methods
    such as:

    * geturl() - return the URL of the resource retrieved, commonly used
      to determine if a redirect was followed

    * info() - return the meta-information of the page, such as headers,
      in the form of an email.message_from_string() instance (see Quick
      Reference to HTTP Headers)

    * getcode() - return the HTTP status code of the response.  Raises
      URLError on errors.

    For HTTP and HTTPS URLs the result is a slightly modified
    http.client.HTTPResponse object.  In addition to the three methods
    above, its msg attribute contains the same information as the reason
    attribute --- the reason phrase returned by the server --- instead of
    the response headers as documented for HTTPResponse.

    For FTP, file, and data URLs, and for requests explicitly handled by
    the legacy URLopener and FancyURLopener classes, the result is a
    urllib.response.addinfourl object.

    Note that None may be returned if no handler handles the request
    (though the default installed global OpenerDirector uses
    UnknownHandler to ensure this never happens).

    In addition, if proxy settings are detected (for example, when a
    *_proxy environment variable like http_proxy is set), ProxyHandler is
    installed by default and makes sure the requests are handled through
    the proxy.
    """
    global _opener

    if cafile or capath or cadefault:
        # CA settings and an explicit context are mutually exclusive.
        if context is not None:
            raise ValueError(
                "You can't pass both context and any of cafile, capath, and "
                "cadefault"
            )
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
                                             cafile=cafile,
                                             capath=capath)
        # One-off opener: it is not stored as the module default.
        opener = build_opener(HTTPSHandler(context=context))
        return opener.open(url, data, timeout)

    if context:
        # Caller-supplied context: again a one-off opener.
        opener = build_opener(HTTPSHandler(context=context))
        return opener.open(url, data, timeout)

    # Plain call: use the shared default opener, creating it on first use.
    if _opener is None:
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000229
def install_opener(opener):
    """Install *opener* as the module-wide default used by urlopen()."""
    global _opener
    new_default = opener
    _opener = new_default
233
# Paths of temporary files created by urlretrieve(); urlcleanup() removes them.
_url_tempfiles = []
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """
    Download the resource at *url* into a file on disk.

    url        -- the URL to fetch (required).
    filename   -- where to store the data.  When omitted, a temporary
                  file is created (and remembered for urlcleanup()).
    reporthook -- optional callable invoked as
                  reporthook(block_number, block_size, total_size) once
                  before the first read and once after every block
                  written; total_size is -1 when the response has no
                  Content-Length header.
    data       -- optional URL encoded data passed on to urlopen().

    For a file: URL without *filename* nothing is copied; the normalized
    local path is returned directly.  When a filename is given and the URL
    points to a local resource, the result is a copy from the local file
    to the new file.

    Returns a (path, headers) tuple: the path of the file holding the data
    and the object returned by the response's info() method.  Raises
    ContentTooShortError when fewer bytes than announced by Content-Length
    were received.
    """
    url_type, path = splittype(url)

    with contextlib.closing(urlopen(url, data)) as response:
        headers = response.info()

        # A local file needs no copy unless a destination was requested.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        # Pick the destination: caller-supplied path or a fresh temp file.
        if filename:
            target = open(filename, 'wb')
        else:
            target = tempfile.NamedTemporaryFile(delete=False)
            filename = target.name
            _url_tempfiles.append(filename)

        with target:
            result = filename, headers
            block_size = 1024*8
            expected = -1
            received = 0
            block_number = 0
            if "content-length" in headers:
                expected = int(headers["Content-Length"])

            if reporthook:
                reporthook(block_number, block_size, expected)

            # Copy until read() returns an empty (falsy) block.
            block = response.read(block_size)
            while block:
                received += len(block)
                target.write(block)
                block_number += 1
                if reporthook:
                    reporthook(block_number, block_size, expected)
                block = response.read(block_size)

    if expected >= 0 and received < expected:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (received, expected), result)

    return result
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000297
def urlcleanup():
    """Remove urlretrieve()'s temporary files and drop the default opener."""
    global _opener

    for leftover in _url_tempfiles:
        # Best effort: a file that cannot be removed is silently skipped.
        with contextlib.suppress(OSError):
            os.unlink(leftover)
    _url_tempfiles.clear()

    if _opener:
        _opener = None
309 _opener = None
310
311# copied from cookielib.py
# Matches a trailing ":<digits>" port suffix.
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return the request-host of *request*, as defined by RFC 2965.

    The host is taken from the network-location part of the request's
    full URL; when that is empty the request's "Host" header is used
    instead.  Any trailing port is stripped.

    Variation from the RFC: the result is lowercased, for convenient
    comparison.
    """
    netloc = urlparse(request.full_url)[1]
    host = request.get_header("Host", "") if netloc == "" else netloc

    # remove port, if present
    return _cut_port_re.sub("", host, count=1).lower()
328
class Request:
    """Encapsulate the state of a single URL request.

    Holds the URL (split into type, host, selector and fragment), the
    optional request body, and two sets of headers: regular ones and
    "unredirected" ones that are not copied onto a redirected request.
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):
        # Assigning full_url runs the property setter, which parses the URL
        # into type/host/selector/fragment.
        self.full_url = url
        # Both header dicts must exist before the data setter runs, because
        # it calls has_header(); _data must exist because it is compared.
        self.headers = {}
        self.unredirected_hdrs = {}
        self._data = None
        self.data = data
        self._tunnel_host = None
        # The default mapping is only read, never modified.
        for name, value in headers.items():
            self.add_header(name, value)
        self.origin_req_host = (request_host(self)
                                if origin_req_host is None
                                else origin_req_host)
        self.unverifiable = unverifiable
        # Only set the attribute for a truthy method so that get_method()
        # can otherwise fall back to its data-based default.
        if method:
            self.method = method

    @property
    def full_url(self):
        """The request URL, with the fragment re-attached when there is one."""
        fragment = self.fragment
        if not fragment:
            return self._full_url
        return '{}#{}'.format(self._full_url, fragment)

    @full_url.setter
    def full_url(self, url):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self._full_url, self.fragment = splittag(unwrap(url))
        self._parse()

    @full_url.deleter
    def full_url(self):
        self._full_url = None
        self.fragment = None
        self.selector = ''

    @property
    def data(self):
        """The request body, or None."""
        return self._data

    @data.setter
    def data(self, data):
        if data == self._data:
            return
        self._data = data
        # issue 16464
        # A Content-length header was most probably computed for the
        # previous body, so it is dropped when the body changes.
        if self.has_header("Content-length"):
            self.remove_header("Content-length")

    @data.deleter
    def data(self):
        self.data = None

    def _parse(self):
        """Split the stored URL into type, host and selector attributes."""
        scheme, rest = splittype(self._full_url)
        self.type = scheme
        if scheme is None:
            raise ValueError("unknown url type: %r" % self.full_url)
        host, self.selector = splithost(rest)
        self.host = unquote(host) if host else host

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        if self.data is not None:
            fallback = "POST"
        else:
            fallback = "GET"
        return getattr(self, 'method', fallback)

    def get_full_url(self):
        """Return the full URL (same value as the full_url property)."""
        return self.full_url

    def set_proxy(self, host, type):
        """Route the request through the proxy at *host*.

        For an https request with no tunnel host recorded yet, the current
        host is remembered as the tunnel host.  Otherwise the request type
        becomes *type* and the selector becomes the full URL.  In both
        cases the request host becomes *host*.
        """
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        """Return True when the selector is the full URL (see set_proxy)."""
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """Return True if *header_name* (exact spelling) is in either set."""
        if header_name in self.headers:
            return True
        return header_name in self.unredirected_hdrs

    def get_header(self, header_name, default=None):
        """Return the header value, preferring regular headers.

        Falls back to the unredirected headers, then to *default*.
        """
        if header_name in self.headers:
            return self.headers[header_name]
        return self.unredirected_hdrs.get(header_name, default)

    def remove_header(self, header_name):
        """Remove *header_name* from both header sets, if present."""
        for table in (self.headers, self.unredirected_hdrs):
            table.pop(header_name, None)

    def header_items(self):
        """Return a list of (name, value) pairs from both header sets.

        A regular header overrides an unredirected one of the same name.
        """
        merged = dict(self.unredirected_hdrs)
        merged.update(self.headers)
        return list(merged.items())
438
class OpenerDirector:
    """Dispatch requests to the handlers registered with add_handler().

    Handlers are indexed by the names of their methods:
    <protocol>_open, <protocol>_request, <protocol>_response and
    <protocol>_error_<kind>.
    """

    def __init__(self):
        self.addheaders = [('User-agent', "Python-urllib/%s" % __version__)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler* under every hook its method names announce.

        Raises TypeError if *handler* has no add_parent attribute.  The
        handler is added to self.handlers (and told about its parent) only
        if at least one of its attributes matched a hook name.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        # Hooks whose table is keyed directly by protocol.
        protocol_tables = {
            "open": self.handle_open,
            "response": self.process_response,
            "request": self.process_request,
        }
        # oops, coincidental matches
        ignored = ("redirect_request", "do_open", "proxy_open")

        registered = False
        for attr in dir(handler):
            if attr in ignored:
                continue

            # Split at the first underscore.  find() returns -1 when there
            # is none, which makes protocol attr[:-1] and condition attr.
            cut = attr.find("_")
            protocol = attr[:cut]
            condition = attr[cut+1:]

            if condition.startswith("error"):
                # Everything after the underscore that follows "error" is
                # the kind; numeric kinds (HTTP status codes) become ints.
                kind = condition[condition.find("_") + 1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                table = self.handle_error.setdefault(protocol, {})
            elif condition in protocol_tables:
                kind = protocol
                table = protocol_tables[condition]
            else:
                continue

            # Keep each chain sorted (handlers compare via __lt__).
            bisect.insort(table.setdefault(kind, []), handler)
            registered = True

        if registered:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        for handler in chain.get(kind, ()):
            outcome = getattr(handler, meth_name)(*args)
            if outcome is not None:
                return outcome

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a Request) and return the response.

        Runs the <protocol>_request processors, opens the request through
        the handler chains, then runs the <protocol>_response processors.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        # The protocol is fixed here; it selects both processor chains.
        protocol = req.type

        # pre-process request
        request_hook = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            req = getattr(processor, request_hook)(req)

        response = self._open(req, data)

        # post-process response
        response_hook = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            response = getattr(processor, response_hook)(req, response)

        return response

    def _open(self, req, data=None):
        """Try the default, protocol-specific and unknown open chains in turn."""
        opened = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if opened:
            return opened

        protocol = req.type
        opened = self._call_chain(self.handle_open, protocol,
                                  protocol + '_open', req)
        if opened:
            return opened

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the matching error handlers.

        For 'http' and 'https' the third positional argument (the status
        code) selects the http_error_<code> chain; if no handler there
        returns a truthy result, the http_error_default chain is tried.
        Other protocols use the <proto>_error chain.
        """
        is_http = proto in ('http', 'https')
        if is_http:
            # XXX http[s] protocols are special-cased
            table = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
        else:
            table = self.handle_error
            meth_name = proto + '_error'

        handled = self._call_chain(table, proto, meth_name, *args)
        if handled:
            return handled

        if is_http:
            return self._call_chain(table, 'default',
                                    'http_error_default', *args)
575 return self._call_chain(*args)
576
577# XXX probably also want an abstract factory that knows when it makes
578# sense to skip a superclass in favor of a subclass and when it might
579# make sense to include both
580
def build_opener(*handlers):
    """Return an OpenerDirector populated from *handlers* plus defaults.

    Each argument is either a handler instance or a handler class (which
    is instantiated without arguments).  The opener also gets several
    default handlers, including support for HTTP, FTP and, when
    applicable, HTTPS.

    A default handler class is left out when any argument is an instance
    or a subclass of it, so the supplied handler takes its place.
    """
    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor,
                       DataHandler]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    def _replaces(candidate, klass):
        # A class replaces a default by subclassing it, an instance by
        # being an instance of it.
        if isinstance(candidate, type):
            return issubclass(candidate, klass)
        return isinstance(candidate, klass)

    overridden = {klass for klass in default_classes
                  if any(_replaces(h, klass) for h in handlers)}

    # Defaults first (in their listed order), then the caller's handlers.
    for klass in default_classes:
        if klass not in overridden:
            opener.add_handler(klass())

    for supplied in handlers:
        instance = supplied() if isinstance(supplied, type) else supplied
        opener.add_handler(instance)
    return opener
616
class BaseHandler:
    """Base class for handlers registered with an OpenerDirector."""

    # Sort key: handlers with a lower value are placed earlier in a chain.
    handler_order = 500

    def add_parent(self, parent):
        """Remember *parent* as this handler's parent."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        try:
            other_order = other.handler_order
        except AttributeError:
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other_order
633 return self.handler_order < other.handler_order
634
635
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return *response*, or the parent's error result for non-2xx codes."""
        status = response.code
        reason = response.msg
        headers = response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= status < 300:
            return response

        return self.parent.error(
            'http', request, response, status, reason, headers)

    https_response = http_response
651 https_response = http_response
652
class HTTPDefaultErrorHandler(BaseHandler):
    """Fallback HTTP error handler: turn the error response into HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        """Raise HTTPError built from the request URL and response details."""
        failure = HTTPError(req.full_url, code, msg, hdrs, fp)
        raise failure
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000656
657class HTTPRedirectHandler(BaseHandler):
658 # maximum number of redirections to any single URL
659 # this is needed because of the state that cookies introduce
660 max_repeats = 4
661 # maximum total number of redirections (regardless of URL) before
662 # assuming we're in a loop
663 max_redirections = 10
664
665 def redirect_request(self, req, fp, code, msg, headers, newurl):
666 """Return a Request or None in response to a redirect.
667
668 This is called by the http_error_30x methods when a
669 redirection response is received. If a redirection should
670 take place, return a new Request to allow http_error_30x to
671 perform the redirect. Otherwise, raise HTTPError if no-one
672 else should try to handle this url. Return None if you can't
673 but another Handler might.
674 """
675 m = req.get_method()
676 if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
677 or code in (301, 302, 303) and m == "POST")):
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000678 raise HTTPError(req.full_url, code, msg, headers, fp)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000679
680 # Strictly (according to RFC 2616), 301 or 302 in response to
681 # a POST MUST NOT cause a redirection without confirmation
Georg Brandl029986a2008-06-23 11:44:14 +0000682 # from the user (of urllib.request, in this case). In practice,
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000683 # essentially all clients do redirect in this case, so we do
684 # the same.
Martin Pantere6f06092016-05-16 01:14:20 +0000685
686 # Be conciliant with URIs containing a space. This is mainly
687 # redundant with the more complete encoding done in http_error_302(),
688 # but it is kept for compatibility with other callers.
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000689 newurl = newurl.replace(' ', '%20')
Martin Pantere6f06092016-05-16 01:14:20 +0000690
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000691 CONTENT_HEADERS = ("content-length", "content-type")
692 newheaders = dict((k, v) for k, v in req.headers.items()
693 if k.lower() not in CONTENT_HEADERS)
694 return Request(newurl,
695 headers=newheaders,
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000696 origin_req_host=req.origin_req_host,
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000697 unverifiable=True)
698
699 # Implementation note: To avoid the server sending us into an
700 # infinite loop, the request object needs to track what URLs we
701 # have already seen. Do this by adding a handler-specific
702 # attribute to the Request object.
def http_error_302(self, req, fp, code, msg, headers):
    """Follow a redirect response and return the result of the new request.

    The target is taken from the "location" header, falling back to
    "uri".  Returns None when neither header is present or when
    self.redirect_request() declines the redirect.  Raises HTTPError for
    a target scheme other than http/https/ftp and when loop detection
    trips.
    """
    # Some servers (incorrectly) return multiple Location headers
    # (so probably same goes for URI).  Use first header.
    for field in ("location", "uri"):
        if field in headers:
            newurl = headers[field]
            break
    else:
        return

    # fix a possible malformed URL
    urlparts = urlparse(newurl)

    # For security reasons we don't allow redirection to anything other
    # than http, https or ftp.  An empty scheme is a relative redirect.
    if urlparts.scheme not in ('http', 'https', 'ftp', ''):
        raise HTTPError(
            newurl, code,
            "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
            headers, fp)

    # A host without a path gets the root path.
    if urlparts.netloc and not urlparts.path:
        pieces = list(urlparts)
        pieces[2] = "/"
        newurl = urlunparse(pieces)

    # http.client.parse_headers() decodes as ISO-8859-1.  Recover the
    # original bytes and percent-encode non-ASCII bytes, and any special
    # characters such as the space.
    quoted = quote(newurl, encoding="iso-8859-1", safe=string.punctuation)
    newurl = urljoin(req.full_url, quoted)

    # XXX Probably want to forget about the state of the current
    # request, although that might interact poorly with other
    # handlers that also use handler-specific request attributes
    new = self.redirect_request(req, fp, code, msg, headers, newurl)
    if new is None:
        return

    # Loop detection: .redirect_dict maps each URL already visited in
    # this redirect chain to the number of times it was visited, and is
    # shared between the old and the new request.
    if hasattr(req, 'redirect_dict'):
        visited = new.redirect_dict = req.redirect_dict
        if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
            raise HTTPError(req.full_url, code,
                            self.inf_msg + msg, headers, fp)
    else:
        visited = new.redirect_dict = req.redirect_dict = {}
    visited[newurl] = visited.get(newurl, 0) + 1

    # Don't close the fp until we are sure that we won't use it
    # with HTTPError.
    fp.read()
    fp.close()

    return self.parent.open(new, timeout=req.timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000762
# 301, 303 and 307 responses are handled by the same method as 302.
http_error_301 = http_error_303 = http_error_307 = http_error_302

# Prefix of the HTTPError message raised by http_error_302() when its
# loop detection trips; the server's own message is appended to it.
inf_msg = "The HTTP server returned a redirect error that would " \
          "lead to an infinite loop.\n" \
          "The last 30x error message was:\n"
768
769
770def _parse_proxy(proxy):
771 """Return (scheme, user, password, host/port) given a URL or an authority.
772
773 If a URL is supplied, it must have an authority (host:port) component.
774 According to RFC 3986, having an authority component means the URL must
Senthil Kumarand8e24f12014-04-14 16:32:20 -0400775 have two slashes after the scheme.
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000776 """
Georg Brandl13e89462008-07-01 19:56:00 +0000777 scheme, r_scheme = splittype(proxy)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000778 if not r_scheme.startswith("/"):
779 # authority
780 scheme = None
781 authority = proxy
782 else:
783 # URL
784 if not r_scheme.startswith("//"):
785 raise ValueError("proxy URL with no authority: %r" % proxy)
786 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
787 # and 3.3.), path is empty or starts with '/'
788 end = r_scheme.find("/", 2)
789 if end == -1:
790 end = None
791 authority = r_scheme[2:end]
Georg Brandl13e89462008-07-01 19:56:00 +0000792 userinfo, hostport = splituser(authority)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000793 if userinfo is not None:
Georg Brandl13e89462008-07-01 19:56:00 +0000794 user, password = splitpasswd(userinfo)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000795 else:
796 user = password = None
797 return scheme, user, password, hostport
798
class ProxyHandler(BaseHandler):
    """Route requests through the proxies given as a {scheme: proxy} mapping.

    For every scheme in the mapping a '<scheme>_open' method is installed
    on the instance; it forwards to proxy_open() with that proxy.
    """

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        """Install one '<scheme>_open' method per entry of *proxies*.

        When *proxies* is None the mapping comes from getproxies().
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            # The other handlers in this file name their methods with a
            # lower-case scheme (http_open, https_open, ...), so normalise
            # the key to match.
            type = type.lower()
            # Bind the loop values as defaults so that each lambda keeps
            # its own proxy and scheme.
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*; return None or the re-opened response.

        Returns None when the host is bypassed or when the remaining
        handlers can deal with the rewritten request.
        """
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000840
class HTTPPasswordMgr:
    """Store (user, password) pairs keyed by realm and by URI prefix."""

    def __init__(self):
        # {realm: {tuple of reduced URIs: (user, passwd)}}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* under one URI or a sequence."""
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        # Store the URIs both with and without the scheme's default port
        # so that a lookup in either form finds them.
        for default_port in True, False:
            reduced_uri = tuple(
                self.reduce_uri(u, default_port) for u in uri)
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) stored for *authuri*, or (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare whole path segments: a character-wise common prefix
        # would wrongly treat "/foobar" as being below "/foo".
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
903
904
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the realm None."""

    def find_user_password(self, realm, authuri):
        """Look up *realm* first, then the default realm (None)."""
        credentials = HTTPPasswordMgr.find_user_password(self, realm, authuri)
        if credentials[0] is None:
            # Nothing stored for this realm: try the catch-all entry.
            credentials = HTTPPasswordMgr.find_user_password(
                self, None, authuri)
        return credentials
913
914
class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm):
    """Password manager that also records an 'authenticated' flag per URI."""

    def __init__(self, *args, **kwargs):
        # {reduced URI: is_authenticated flag}
        self.authenticated = {}
        super().__init__(*args, **kwargs)

    def add_password(self, realm, uri, user, passwd, is_authenticated=False):
        """Store credentials and the initial authenticated flag for *uri*."""
        self.update_authenticated(uri, is_authenticated)
        # Add a default for prior auth requests
        if realm is not None:
            super().add_password(None, uri, user, passwd)
        super().add_password(realm, uri, user, passwd)

    def update_authenticated(self, uri, is_authenticated=False):
        """Set the authenticated flag for one URI or a sequence of URIs."""
        # uri could be a single URI or a sequence
        uris = [uri] if isinstance(uri, str) else uri

        for default_port in (True, False):
            for item in uris:
                key = self.reduce_uri(item, default_port)
                self.authenticated[key] = is_authenticated

    def is_authenticated(self, authuri):
        """Return the flag of the first stored URI covering *authuri*.

        Returns None when no stored URI covers it.
        """
        for default_port in (True, False):
            target = self.reduce_uri(authuri, default_port)
            for known, flag in self.authenticated.items():
                if self.is_suburi(known, target):
                    return flag
        return None
944
945
class AbstractBasicAuthHandler:
    """Shared implementation of HTTP Basic authentication.

    Subclasses provide ``auth_header`` (the request header to fill in)
    and ``parent`` (the opener used to retry the request).
    """

    # Matches one '<scheme> realm=<value>' challenge; groups are
    # (scheme, quote character or '', realm).
    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    # The pattern is anchored on the start of the string or a comma rather
    # than on a leading '(?:.*,)*', which backtracks exponentially on
    # headers containing many commas.
    rx = re.compile('(?:^|,)'     # start of the string or ','
                    '[ \t]*'      # optional whitespace
                    '([^ \t,]+)'  # scheme like "Basic"
                    '[ \t]+'      # mandatory whitespace
                    'realm=(["\']?)([^"\']*)\\2',
                    re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry *req* with Basic credentials for the challenged realm.

        *authreq* is the name of the challenge header to read from
        *headers*.  Returns None when there is no usable challenge or no
        stored password.  Raises ValueError when the header does not
        start with the Basic scheme.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)
        if not authreq:
            return None

        scheme = authreq.split()[0]
        if scheme.lower() != 'basic':
            raise ValueError("AbstractBasicAuthHandler does not"
                             " support the following scheme: '%s'" %
                             scheme)

        # Use the first Basic challenge that names a realm.
        for mo in AbstractBasicAuthHandler.rx.finditer(authreq):
            scheme, quote, realm = mo.groups()
            if quote not in ['"', "'"]:
                warnings.warn("Basic Auth Realm was unquoted",
                              UserWarning, 2)
            if scheme.lower() == 'basic':
                return self.retry_http_basic_auth(host, req, realm)
        return None

    def retry_http_basic_auth(self, host, req, realm):
        """Re-open *req* with an auth header; None if that cannot help."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            # The same credentials were already sent and rejected.
            if req.get_header(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None

    def http_request(self, req):
        """Send credentials up front for URLs the manager marks authenticated.

        Only applies when the password manager has an is_authenticated()
        method.
        """
        if (not hasattr(self.passwd, 'is_authenticated') or
                not self.passwd.is_authenticated(req.full_url)):
            return req

        if not req.has_header('Authorization'):
            user, passwd = self.passwd.find_user_password(None, req.full_url)
            credentials = '{0}:{1}'.format(user, passwd).encode()
            auth_str = base64.standard_b64encode(credentials).decode()
            req.add_unredirected_header('Authorization',
                                        'Basic {}'.format(auth_str.strip()))
        return req

    def http_response(self, req, response):
        """Record whether the URL answered with a 2xx status."""
        if hasattr(self.passwd, 'is_authenticated'):
            if 200 <= response.code < 300:
                self.passwd.update_authenticated(req.full_url, True)
            else:
                self.passwd.update_authenticated(req.full_url, False)
        return response

    https_request = http_request
    https_response = http_response
1023
1024
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001025
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses with Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Hand the 'www-authenticate' challenge to the shared Basic logic."""
        # Credentials are looked up by the full URL of the request.
        return self.http_error_auth_reqed('www-authenticate',
                                          req.full_url, req, headers)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001035
1036
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses with Basic credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Hand the 'proxy-authenticate' challenge to the shared Basic logic."""
        # http_error_auth_reqed requires that there is no userinfo component
        # in authority.  Assume there isn't one, since urllib.request does not
        # (and should not, RFC 3986 s. 3.2.1) support requests for URLs
        # containing userinfo.
        return self.http_error_auth_reqed('proxy-authenticate',
                                          req.host, req, headers)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001050
1051
# Return n random bytes.  AbstractDigestAuthHandler.get_cnonce() mixes
# eight of them into the client nonce it generates.
_randombytes = os.urandom
1054
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001055
class AbstractDigestAuthHandler:
    """Shared implementation of HTTP Digest authentication.

    Subclasses provide ``auth_header`` (the request header to fill in)
    and ``parent`` (the opener used to retry the request).
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry *req* with a Digest response to the challenge in *headers*.

        Returns None for a missing or Basic challenge.  Raises HTTPError
        once the retry counter exceeds 5 and ValueError for any scheme
        other than Digest or Basic.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        """Re-open *req* with a Digest header built from the challenge *auth*.

        Returns None when no header can be built or when the same header
        was already sent.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        """Return a fresh 16-hex-digit client nonce."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Return the Digest credentials string for challenge dict *chal*.

        Returns None when the challenge lacks 'realm' or 'nonce', when no
        hash implementation is available, or when no user is stored for
        the realm.  Raises URLError when the server offers a qop but
        'auth' is not among the offered values.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in [value.strip() for value in qop.split(',')]:
            # The server may offer several comma-separated qop values
            # (e.g. "auth,auth-int"); the client picks one, and 'auth' is
            # the only one implemented here.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth',
                                           H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) functions for *algorithm* ('MD5' or 'SHA').

        Raises ValueError for any other algorithm name.
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        else:
            raise ValueError("Unsupported digest authentication "
                             "algorithm %r" % algorithm)
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
1198
1199
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        """Answer a 'www-authenticate' challenge, then reset the retry count."""
        netloc = urlparse(req.full_url).netloc
        outcome = self.http_error_auth_reqed('www-authenticate',
                                             netloc, req, headers)
        # Only reached when the retry did not raise.
        self.reset_retry_count()
        return outcome
1216
1217
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 responses with Digest credentials."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        """Answer a 'proxy-authenticate' challenge, then reset the retry count."""
        outcome = self.http_error_auth_reqed('proxy-authenticate',
                                             req.host, req, headers)
        # Only reached when the retry did not raise.
        self.reset_retry_count()
        return outcome
1229
class AbstractHTTPHandler(BaseHandler):
    """Request preparation and sending shared by the HTTP(S) handlers."""

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in the headers still missing on *request* and return it.

        Adds Content-type and Content-length for requests with data, the
        Host header, and the opener's default headers.  Raises URLError
        when the request has no host, TypeError for str data, and
        ValueError for iterable data sent without a Content-length.
        """
        # Imported here because the submodule is only needed in this method.
        import collections.abc

        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                msg = "POST data should be bytes or an iterable of bytes. " \
                      "It cannot be of type str."
                raise TypeError(msg)
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                try:
                    mv = memoryview(data)
                except TypeError:
                    # Not a buffer: the length cannot be computed here.
                    # (The ABC lives in collections.abc; the alias in
                    # collections was removed in Python 3.10.)
                    if isinstance(data, collections.abc.Iterable):
                        raise ValueError("Content-Length should be specified "
                                "for iterable data of type %r %r" % (type(data),
                                data))
                else:
                    request.add_unredirected_header(
                            'Content-length', '%d' % (len(mv) * mv.itemsize))

        sel_host = host
        if request.has_proxy():
            # Through a proxy the selector is the full URL; the Host
            # header must name the host taken from that URL.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        Raises URLError when the request has no host or when sending it
        fails with an OSError.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers take precedence over the regular ones.
        headers = dict(req.unredirected_hdrs)
        headers.update({k: v for k, v in req.headers.items()
                        if k not in headers})

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = {name.title(): val for name, val in headers.items()}

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers)
            except OSError as err:  # timeout error
                raise URLError(err)
            r = h.getresponse()
        except BaseException:
            # Close the connection on any failure, then let the
            # exception propagate unchanged.
            h.close()
            raise

        # If the server does not send us a 'Connection: close' header,
        # HTTPConnection assumes the socket should be left open. Manually
        # mark the socket to be closed when this response object goes away.
        if h.sock:
            h.sock.close()
            h.sock = None

        r.url = req.get_full_url()
        # Overwrite the response's .msg attribute with the reason phrase,
        # because urllib clients expect the response to have the reason
        # in .msg.  It would be good to mark this attribute as deprecated
        # and get them to use info() or .headers instead.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001342
1343
class HTTPHandler(AbstractHTTPHandler):
    """Open http: URLs over an http.client.HTTPConnection."""

    # Requests are pre-processed by the shared implementation.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Send *req* and return the response."""
        connection_class = http.client.HTTPConnection
        return self.do_open(connection_class, req)
1350
# HTTPS support is only defined when http.client provides HTTPSConnection.
if hasattr(http.client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Open https: URLs over an http.client.HTTPSConnection."""

        # Requests are pre-processed by the shared implementation.
        https_request = AbstractHTTPHandler.do_request_

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            AbstractHTTPHandler.__init__(self, debuglevel)
            # Both values are handed unchanged to every connection.
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            """Send *req* and return the response."""
            connection_options = {
                'context': self._context,
                'check_hostname': self._check_hostname,
            }
            return self.do_open(http.client.HTTPSConnection, req,
                                **connection_options)

    __all__.append('HTTPSHandler')
Senthil Kumaran0d54eb92011-11-01 23:49:46 +08001367
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to requests and collect them from responses."""

    def __init__(self, cookiejar=None):
        """Use *cookiejar*, or a new http.cookiejar.CookieJar when None."""
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        """Let the jar add its cookie header to *request*; return it."""
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Let the jar extract the cookies of *response*; return it."""
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    # https traffic is handled exactly like http traffic.
    https_request = http_request
    https_response = http_response
1385
class UnknownHandler(BaseHandler):
    """Reject every request with a URLError naming its URL type."""

    def unknown_open(self, req):
        """Always raise URLError for *req*."""
        raise URLError('unknown url type: %s' % req.type)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001390
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dict.  A value enclosed in double quotes has the quotes
    removed; an empty value ('key=') is stored as ''.  An element without
    '=' raises ValueError.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices instead of indexes so that an empty value does not
        # raise IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1400
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    current = []          # characters of the element being built
    in_quotes = False
    escaped = False

    for ch in s:
        if escaped:
            # The character after a backslash is taken literally.
            current.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                # The backslash itself is not kept.
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                current.append(ch)
        elif ch == ',':
            # Unquoted comma: the element is complete (possibly empty).
            items.append(''.join(current))
            current = []
        else:
            if ch == '"':
                in_quotes = True
            current.append(ch)

    # append last part
    if current:
        items.append(''.join(current))

    return [item.strip() for item in items]
1443
class FileHandler(BaseHandler):
    """Open file: URLs that refer to the local machine."""

    # Cached addresses of the local host, filled in by get_names().
    names = None

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open *req* as a local file.

        Raises URLError when the URL names a host that is not one of the
        local addresses.
        """
        url = req.selector
        names_a_host = (url[:2] == '//' and url[2:3] != '/' and
                        (req.host and req.host != 'localhost'))
        if not names_a_host:
            return self.open_local_file(req)
        if req.host not in self.get_names():
            raise URLError("file:// scheme is supported only on localhost")

    def get_names(self):
        """Return the tuple of local addresses, resolving them only once."""
        if FileHandler.names is None:
            try:
                loopback = socket.gethostbyname_ex('localhost')[2]
                own = socket.gethostbyname_ex(socket.gethostname())[2]
                FileHandler.names = tuple(loopback + own)
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the file named by *req*.

        Raises URLError when an OSError occurs or when the URL's host is
        not the local machine.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            mtype = mimetypes.guess_type(filename)[0]
            header_text = (
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', stats.st_size,
                 email.utils.formatdate(stats.st_mtime, usegmt=True)))
            headers = email.message_from_string(header_text)
            if host:
                host, port = splitport(host)
            # Only a missing host, or a port-less host that resolves to
            # one of our own addresses, counts as local.
            if not host or \
                    (not port and _safe_gethostbyname(host) in self.get_names()):
                origurl = 'file://' + (host or '') + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as exp:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(exp)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001495
1496def _safe_gethostbyname(host):
1497 try:
1498 return socket.gethostbyname(host)
1499 except socket.gaierror:
1500 return None
1501
class FTPHandler(BaseHandler):
    """Handler for ``ftp:`` URLs."""

    def ftp_open(self, req):
        """Fetch the file or directory listing named by *req* over FTP.

        Returns an addinfourl whose headers carry Content-type (when it can
        be guessed from the URL) and Content-length (when the server
        reports a non-negative length).  Raises URLError when no host is
        given, when the host cannot be resolved, or when any ftplib error
        occurs.
        """
        import ftplib
        import mimetypes
        netloc = req.host
        if not netloc:
            raise URLError('ftp error: no host given')
        netloc, port_text = splitport(netloc)
        port = ftplib.FTP_PORT if port_text is None else int(port_text)

        # username/password handling
        credentials, hostname = splituser(netloc)
        if credentials:
            username, password = splitpasswd(credentials)
        else:
            username, password = credentials, None
        hostname = unquote(hostname)
        username = username or ''
        password = password or ''

        try:
            address = socket.gethostbyname(hostname)
        except OSError as msg:
            raise URLError(msg)
        path, attrs = splitattr(req.selector)
        segments = [unquote(segment) for segment in path.split('/')]
        dirs, filename = segments[:-1], segments[-1]
        # An absolute path yields an empty leading component; drop it.
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(username, password, address, port, dirs,
                                  req.timeout)
            # Binary transfer for a named file, directory listing otherwise;
            # a ";type=..." URL attribute overrides that choice.
            transfer_type = 'I' if filename else 'D'
            for attr in attrs:
                attr_name, attr_value = splitvalue(attr)
                if (attr_name.lower() == 'type' and
                        attr_value in ('a', 'A', 'i', 'I', 'd', 'D')):
                    transfer_type = attr_value.upper()
            fp, retrlen = fw.retrfile(filename, transfer_type)
            header_text = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                header_text += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                header_text += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(header_text)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as exp:
            exc = URLError('ftp error: %r' % exp)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a new, non-persistent ftpwrapper for the given target."""
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001559
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps FTP connections in a cache for reuse.

    Entries are keyed by (user, host, port, joined dirs, timeout) and are
    dropped once their expiry time passes or once the cache holds
    ``max_conns`` entries.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}      # key -> cached connection wrapper
        self.timeout = {}    # key -> absolute expiry time (time.time() based)
        self.soonest = 0     # earliest expiry among cached entries, 0 if none
        self.delay = 60      # seconds added to "now" on every use
        self.max_conns = 16  # cache size that triggers an eviction

    def setTimeout(self, t):
        """Set the number of seconds a cached connection stays valid."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which an entry is evicted."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached connection for the target, creating it if needed.

        The entry's expiry time is refreshed on every call.
        """
        key = user, host, port, '/'.join(dirs), timeout
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
        self.timeout[key] = time.time() + self.delay
        # Take the connection before pruning: check_cache() may evict this
        # very entry (e.g. when max_conns is 1), and looking it up again
        # afterwards would raise KeyError.
        conn = self.cache[key]
        self.check_cache()
        return conn

    def check_cache(self):
        """Drop expired entries, then evict one entry if the cache is full."""
        # first check for old ones
        now = time.time()
        if self.soonest <= now:
            for key, expiry in list(self.timeout.items()):
                if expiry < now:
                    self.cache[key].close()
                    del self.cache[key]
                    del self.timeout[key]
        # default=0 keeps this working when nothing is cached; 0 is also
        # the value __init__ starts from.
        self.soonest = min(self.timeout.values(), default=0)

        # then check the size
        if len(self.cache) == self.max_conns:
            for key, expiry in list(self.timeout.items()):
                if expiry == self.soonest:
                    del self.cache[key]
                    del self.timeout[key]
                    break
            self.soonest = min(self.timeout.values(), default=0)

    def clear_cache(self):
        """Close every cached connection and empty the cache."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
1612
class DataHandler(BaseHandler):
    """Handler for ``data:`` URLs."""

    def data_open(self, req):
        """Decode the data: URL in *req* and return it as an addinfourl.

        The response headers carry Content-type (the URL's media type, or
        "text/plain;charset=US-ASCII" when none is given) and
        Content-length.  ValueError propagates when the URL has no ","
        separating the media type from the data.
        """
        # data URLs as specified in RFC 2397.
        #
        # ignores POSTed data
        #
        # syntax:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        full_url = req.full_url

        _scheme, remainder = full_url.split(":", 1)
        mediatype, payload = remainder.split(",", 1)

        # even base64 encoded data URLs might be quoted so unquote in any case:
        payload = unquote_to_bytes(payload)
        base64_marker = ";base64"
        if mediatype.endswith(base64_marker):
            payload = base64.decodebytes(payload)
            mediatype = mediatype[:-len(base64_marker)]

        mediatype = mediatype or "text/plain;charset=US-ASCII"

        header_text = ("Content-type: %s\nContent-length: %d\n" %
                       (mediatype, len(payload)))
        headers = email.message_from_string(header_text)

        return addinfourl(io.BytesIO(payload), headers, full_url)
1642
Nadeem Vawda08f5f7a2011-07-23 14:03:00 +02001643
# Code moved from the old urllib module
1645
MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name != 'nt':
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        # Here the conversion is just percent-decoding.
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        # Here the conversion is just percent-encoding.
        return quote(pathname)
else:
    from nturl2path import url2pathname, pathname2url
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001661
1662# This really consists of two pieces:
1663# (1) a class which handles opening of all sorts of URLs
1664# (plus assorted utilities etc.)
1665# (2) a set of functions for parsing URLs
1666# XXX Should these be separated out into different modules?
1667
1668
# Module-wide FTP connection cache; URLopener.__init__ stores a reference to
# this same dict on every instance as ``self.ftpcache``.
ftpcache = {}
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    # Class-level default; __init__ replaces it with a per-instance list of
    # temporary file names that cleanup() removes.
    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        """Initialise the opener and emit a DeprecationWarning.

        proxies -- mapping of URL scheme to proxy URL; getproxies() is
                   used when it is None.
        x509    -- optional 'key_file' and 'cert_file' keyword arguments,
                   stored for use by HTTPS connections.
        """
        msg = "%(class)s style of invoking requests is deprecated. " \
              "Use newer urlopen functions/methods" % {'class': self.__class__.__name__}
        warnings.warn(msg, DeprecationWarning, stacklevel=3)
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        """Run close() when the opener is finalized."""
        self.close()

    def close(self):
        """Release resources held by the opener; delegates to cleanup()."""
        self.cleanup()

    def cleanup(self):
        """Delete the temporary files made by retrieve() and clear tempcache."""
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    # os.unlink was saved on the instance in __init__.
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r').

        Dispatches to the method named ``open_<scheme>`` (with '-' in the
        scheme replaced by '_').  A URL without a scheme is treated as
        'file'.  HTTPError and URLError propagate unchanged; any other
        OSError is re-raised as OSError('socket error', original).
        """
        fullurl = unwrap(to_bytes(fullurl))
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            # Dispatch on the proxy's own scheme; the open_*() method gets a
            # (proxy host, full target URL) tuple instead of a string.
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except (HTTPError, URLError):
            raise
        except OSError as msg:
            raise OSError('socket error', msg).with_traceback(sys.exc_info()[2])

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type.

        This implementation always raises OSError."""
        type, url = splittype(fullurl)
        raise OSError('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type through a proxy.

        This implementation always raises OSError."""
        type, url = splittype(fullurl)
        raise OSError('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object.

        reporthook, when given, is called as reporthook(blocknum, blocksize,
        totalsize) once before the first read and once after each block
        written; totalsize is -1 when the response has no Content-Length
        header.  Raises ContentTooShortError when fewer bytes arrive than
        that header announced.
        """
        url = unwrap(to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        if filename is None and (not type or type == 'file'):
            # Local file and no target name requested: return the file's own
            # path without copying.  On failure fall through to self.open().
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return url2pathname(splithost(url1)[1]), hdrs
            except OSError as msg:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # No target given: write to a temporary file whose suffix is
                # taken from the URL path; cleanup() deletes it later.
                import tempfile
                garbage, path = splittype(url)
                garbage, path = splithost(path or "")
                path, garbage = splitquery(path or "")
                path, garbage = splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result

    # Each method named open_<type> knows how to open that type of URL

    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_class.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieval or a host, relative-path pair.
        - data is payload for a POST request or None.

        Returns an addinfourl for a 2xx status; any other status is handed
        to self.http_error().  Raises OSError when no host can be
        determined and URLError on a bad status line.
        """

        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            # Direct request: url is "//host/path".
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            # Proxied request: url is the (proxy host, full URL) tuple built
            # by open().
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    # Rebuild the selector without the user:password part.
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

        if not host: raise OSError('http error', 'no host given')

        if proxy_passwd:
            proxy_passwd = unquote(proxy_passwd)
            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
        else:
            proxy_auth = None

        if user_passwd:
            user_passwd = unquote(user_passwd)
            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
        else:
            auth = None
        http_conn = connection_factory(host)
        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] = "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost

        # Add Connection:close as we don't support persistent connections yet.
        # This helps in closing the socket and avoiding ResourceWarning

        headers["Connection"] = "close"

        # Headers from self.addheaders are applied last, so they replace any
        # header of the same name set above.
        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        return self._open_generic_http(http.client.HTTPConnection, url, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code.

        A specific handler's result is returned only when it is true;
        otherwise http_error_default() is called."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise HTTPError."""
        fp.close()
        raise HTTPError(url, errcode, errmsg, headers, None)

    # The HTTPS methods are only defined when _have_ssl is true.
    if _have_ssl:
        def _https_connection(self, host):
            """Return an HTTPSConnection to *host* using this opener's
            key_file and cert_file."""
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL.

        Raises URLError for a proxied request (non-string *url*) and
        ValueError when the URL names a host other than 'localhost'.
        """
        if not isinstance(url, str):
            raise URLError('file error: proxy support for file protocol currently not implemented')
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            raise ValueError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file.

        Returns an addinfourl whose headers carry Content-Type (guessed from
        the URL, 'text/plain' when unknown), Content-Length and
        Last-modified.  Raises URLError when the file cannot be stat'ed or
        the host is not local.
        """
        import email.utils
        import mimetypes
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = splitport(host)
        # A host is accepted only without a port and only when it resolves
        # to one of localhost() / thishost().
        if (not port
           and socket.gethostbyname(host) in ((localhost(),) + thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            elif file[:2] == './':
                raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error: not on local host')

    def open_ftp(self, url):
        """Use FTP protocol.

        Connections are kept in self.ftpcache, keyed by
        (user, host, port, joined dirs).  Raises URLError for a proxied
        request, a missing host, or any error listed by ftperrors().
        """
        if not isinstance(url, str):
            raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
        import mimetypes
        host, path = splithost(url)
        if not host: raise URLError('ftp error: no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user: user, passwd = splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        # Drop the empty component produced by the leading '/'; if the next
        # component is empty too (path began with '//'), make it '/'.
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in list(self.ftpcache):
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if key not in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            # Directory listing when no file name is given, binary transfer
            # otherwise; a ";type=..." attribute overrides this.
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors() as exp:
            raise URLError('ftp error %r' % exp).with_traceback(sys.exc_info()[2])

    def open_data(self, url, data=None):
        """Use "data" URL.

        Raises URLError for a proxied request and OSError when the URL
        contains no ','.
        """
        if not isinstance(url, str):
            raise URLError('data error: proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise OSError('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        # A trailing ";token" without '=' is taken as the encoding
        # (e.g. ";base64"), not as a media-type parameter.
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            # XXX is this encoding/decoding ok?
            data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        headers = email.message_from_string(msg)
        # The stream is built from the whole message text, so it contains
        # the header lines and the blank line as well as the data.
        # NOTE(review): reading it therefore yields the headers too --
        # confirm whether only the data was intended.
        f = io.StringIO(msg)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002114
2115
2116class FancyURLopener(URLopener):
2117 """Derived class with handlers for errors we can handle (perhaps)."""
2118
def __init__(self, *args, **kwargs):
    """Initialise the base URLopener, then the redirect and auth state."""
    URLopener.__init__(self, *args, **kwargs)
    # Redirect bookkeeping: http_error_302() counts attempts in `tries`
    # and stops redirecting once `maxtries` is reached.
    self.maxtries = 10
    self.tries = 0
    self.auth_cache = {}
2123 self.maxtries = 10
2124
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Default error handling -- don't raise an exception.

    The error response is returned as an addinfourl carrying *errcode*.
    """
    response_url = "http:" + url
    return addinfourl(fp, headers, response_url, errcode)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002128
def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 302 -- relocated (temporarily).

    Follows the redirect via redirect_internal().  Once self.tries reaches
    self.maxtries, the response is instead reported as a 500 "Redirect
    Recursion" error through http_error_500 (when defined) or
    http_error_default.  self.tries is reset to 0 on the way out.
    """
    self.tries += 1
    try:
        limit_reached = self.maxtries and self.tries >= self.maxtries
        if not limit_reached:
            return self.redirect_internal(url, fp, errcode, errmsg,
                                          headers, data)
        if hasattr(self, "http_error_500"):
            report = self.http_error_500
        else:
            report = self.http_error_default
        return report(url, fp, 500,
                      "Internal Server Error: Redirect Recursion",
                      headers)
    finally:
        self.tries = 0
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002146
def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
    """Open the redirect target named by the 'location' or 'uri' header.

    Returns None when neither header is present.  Otherwise closes *fp*,
    resolves the target against the original URL and opens it with
    self.open().  Raises HTTPError when the target's scheme is not http,
    https, ftp or empty.
    """
    for field in ('location', 'uri'):
        if field in headers:
            target = headers[field]
            break
    else:
        return
    fp.close()

    # In case the server sent a relative URL, join with original:
    target = urljoin(self.type + ":" + url, target)

    # For security reasons, we don't allow redirection to anything other
    # than http, https and ftp.

    # We are using newer HTTPError with older redirect_internal method
    # This older method will get deprecated in 3.3

    scheme = urlparse(target).scheme
    if scheme not in ('http', 'https', 'ftp', ''):
        raise HTTPError(target, errcode,
                        errmsg +
                        " Redirection to url '%s' is not allowed." % target,
                        headers, fp)

    return self.open(target)
2174
def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 301 -- also relocated (permanently)."""
    # Handled exactly like a 302.
    follow_redirect = self.http_error_302
    return follow_redirect(url, fp, errcode, errmsg, headers, data)
2178
def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 303 -- also relocated (essentially identical to 302)."""
    follow_redirect = self.http_error_302
    return follow_redirect(url, fp, errcode, errmsg, headers, data)
2182
def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 307 -- relocated, but turn POST into error."""
    # A request that carried data is not followed; it goes to the default
    # error handler instead.
    if data is not None:
        return self.http_error_default(url, fp, errcode, errmsg, headers)
    return self.http_error_302(url, fp, errcode, errmsg, headers, data)
2189
def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
        retry=False):
    """Error 401 -- authentication required.
    This function supports Basic authentication only.

    When the response carries a Basic challenge and *retry* is true, the
    request is retried through the ``retry_<type>_basic_auth`` method;
    every other case is passed to URLopener.http_error_default().
    """
    def give_up():
        # Hand the response to the base class's default error handler.
        URLopener.http_error_default(self, url, fp,
                                     errcode, errmsg, headers)

    if 'www-authenticate' not in headers:
        give_up()
    challenge = headers['www-authenticate']
    parsed = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', challenge)
    if not parsed:
        give_up()
    scheme, realm = parsed.groups()
    if scheme.lower() != 'basic':
        give_up()
    if not retry:
        give_up()
    retry_method = getattr(self, 'retry_' + self.type + '_basic_auth')
    if data is None:
        return retry_method(url, realm)
    return retry_method(url, realm, data)
2214
def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
                   retry=False):
    """Error 407 -- the proxy demands authentication.

    Only the Basic scheme is handled.  When the challenge is usable and
    *retry* is true, the request is re-issued through the
    retry_proxy_<type>_basic_auth method matching self.type.
    """
    def reject():
        # Hand the response to the base-class default handler.
        # NOTE(review): presumably this raises HTTPError; if it ever
        # returned, processing would simply continue below.
        URLopener.http_error_default(self, url, fp,
                                     errcode, errmsg, headers)

    if 'proxy-authenticate' not in headers:
        reject()
    challenge = headers['proxy-authenticate']
    parsed = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', challenge)
    if not parsed:
        reject()
    scheme, realm = parsed.groups()
    if scheme.lower() != 'basic':
        reject()
    if not retry:
        reject()
    retry_method = getattr(self, 'retry_proxy_' + self.type + '_basic_auth')
    if data is None:
        return retry_method(url, realm)
    return retry_method(url, realm, data)
2239
def retry_proxy_http_basic_auth(self, url, realm, data=None):
    """Re-open *url* over http after storing Basic credentials in the
    'http' proxy URL.

    Returns None when get_user_passwd() yields neither a user name nor
    a password.
    """
    target_host, target_selector = splithost(url)
    retry_url = 'http://' + target_host + target_selector
    _scheme, proxy_rest = splittype(self.proxies['http'])
    proxy_netloc, proxy_selector = splithost(proxy_rest)
    # Index just past any "user:pass@" prefix; 0 when there is none.
    creds_end = proxy_netloc.find('@') + 1
    bare_proxy = proxy_netloc[creds_end:]
    # A non-zero index asks get_user_passwd() to drop its cached entry.
    user, passwd = self.get_user_passwd(bare_proxy, realm, creds_end)
    if not user and not passwd:
        return None
    quoted_user = quote(user, safe='')
    quoted_passwd = quote(passwd, safe='')
    authed_proxy = "%s:%s@%s" % (quoted_user, quoted_passwd, bare_proxy)
    self.proxies['http'] = 'http://' + authed_proxy + proxy_selector
    if data is not None:
        return self.open(retry_url, data)
    return self.open(retry_url)
2257
def retry_proxy_https_basic_auth(self, url, realm, data=None):
    """Re-open *url* over https after storing Basic credentials in the
    'https' proxy URL.

    Returns None when get_user_passwd() yields neither a user name nor
    a password.
    """
    target_host, target_selector = splithost(url)
    retry_url = 'https://' + target_host + target_selector
    _scheme, proxy_rest = splittype(self.proxies['https'])
    proxy_netloc, proxy_selector = splithost(proxy_rest)
    # Index just past any "user:pass@" prefix; 0 when there is none.
    creds_end = proxy_netloc.find('@') + 1
    bare_proxy = proxy_netloc[creds_end:]
    # A non-zero index asks get_user_passwd() to drop its cached entry.
    user, passwd = self.get_user_passwd(bare_proxy, realm, creds_end)
    if not user and not passwd:
        return None
    quoted_user = quote(user, safe='')
    quoted_passwd = quote(passwd, safe='')
    authed_proxy = "%s:%s@%s" % (quoted_user, quoted_passwd, bare_proxy)
    self.proxies['https'] = 'https://' + authed_proxy + proxy_selector
    if data is not None:
        return self.open(retry_url, data)
    return self.open(retry_url)
2275
def retry_http_basic_auth(self, url, realm, data=None):
    """Re-open *url* over http with Basic credentials embedded in the
    host part.

    Returns None when get_user_passwd() yields neither a user name nor
    a password.
    """
    netloc, selector = splithost(url)
    # Index just past any "user:pass@" prefix; 0 when there is none.
    creds_end = netloc.find('@') + 1
    bare_host = netloc[creds_end:]
    # A non-zero index asks get_user_passwd() to drop its cached entry.
    user, passwd = self.get_user_passwd(bare_host, realm, creds_end)
    if not user and not passwd:
        return None
    quoted_user = quote(user, safe='')
    quoted_passwd = quote(passwd, safe='')
    authed_host = "%s:%s@%s" % (quoted_user, quoted_passwd, bare_host)
    retry_url = 'http://' + authed_host + selector
    if data is not None:
        return self.open(retry_url, data)
    return self.open(retry_url)
2289
def retry_https_basic_auth(self, url, realm, data=None):
    """Re-open *url* over https with Basic credentials embedded in the
    host part.

    Returns None when get_user_passwd() yields neither a user name nor
    a password.
    """
    netloc, selector = splithost(url)
    # Index just past any "user:pass@" prefix; 0 when there is none.
    creds_end = netloc.find('@') + 1
    bare_host = netloc[creds_end:]
    # A non-zero index asks get_user_passwd() to drop its cached entry.
    user, passwd = self.get_user_passwd(bare_host, realm, creds_end)
    if not user and not passwd:
        return None
    quoted_user = quote(user, safe='')
    quoted_passwd = quote(passwd, safe='')
    authed_host = "%s:%s@%s" % (quoted_user, quoted_passwd, bare_host)
    retry_url = 'https://' + authed_host + selector
    if data is not None:
        return self.open(retry_url, data)
    return self.open(retry_url)
2303
def get_user_passwd(self, host, realm, clear_cache=0):
    """Return (user, passwd) for *realm* at *host*, using self.auth_cache.

    A cached pair is returned directly unless *clear_cache* is true, in
    which case the entry is discarded and the user is prompted again.
    Only a pair with a non-empty user or password is stored.
    """
    cache_key = realm + '@' + host.lower()
    cache = self.auth_cache
    if cache_key in cache:
        if not clear_cache:
            return cache[cache_key]
        del cache[cache_key]
    user, passwd = self.prompt_user_passwd(host, realm)
    if user or passwd:
        cache[cache_key] = (user, passwd)
    return user, passwd
2314
def prompt_user_passwd(self, host, realm):
    """Ask for a user name and password on the terminal.

    Override this in a GUI environment!  Returns (None, None) when the
    prompt is interrupted with Ctrl-C.
    """
    from getpass import getpass as read_password
    try:
        user_prompt = "Enter username for %s at %s: " % (realm, host)
        user = input(user_prompt)
        passwd_prompt = ("Enter password for %s in %s at %s: " %
                         (user, realm, host))
        passwd = read_password(passwd_prompt)
        return user, passwd
    except KeyboardInterrupt:
        # Finish the interrupted prompt line before giving up.
        print()
        return None, None
2326
2327
2328# Utility functions
2329
_localhost = None


def localhost():
    """Return the IP address that 'localhost' resolves to.

    The lookup happens once; later calls return the cached value.
    """
    global _localhost
    address = _localhost
    if address is None:
        address = _localhost = socket.gethostbyname('localhost')
    return address
2337
_thishost = None


def thishost():
    """Return a tuple of the IP addresses of the current host.

    If the machine's own host name cannot be resolved, the addresses of
    'localhost' are used instead.  The result is cached.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    try:
        addresses = socket.gethostbyname_ex(socket.gethostname())[2]
    except socket.gaierror:
        addresses = socket.gethostbyname_ex('localhost')[2]
    _thishost = tuple(addresses)
    return _thishost
2348
_ftperrors = None


def ftperrors():
    """Return ftplib.all_errors, importing ftplib only on first use."""
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    from ftplib import all_errors
    _ftperrors = all_errors
    return _ftperrors
2357
_noheaders = None


def noheaders():
    """Return a shared, empty email Message object.

    The message is built on first use and the same object is returned
    by every later call.
    """
    global _noheaders
    message = _noheaders
    if message is None:
        message = _noheaders = email.message_from_string("")
    return message
2365
2366
2367# Utility classes
2368
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        """Remember the connection parameters and connect immediately.

        If init() fails, close() is called before the exception is
        re-raised.
        """
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        # Number of file objects handed out by retrfile() and not yet closed.
        self.refcount = 0
        self.keepalive = persistent
        try:
            self.init()
        except BaseException:
            self.close()
            raise

    def init(self):
        """Open the FTP connection, log in and change to the target dir."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        _target = '/'.join(self.dirs)
        self.ftp.cwd(_target)

    def retrfile(self, file, type):
        """Start a transfer and return (file object, retrieval length).

        *type* 'd' or 'D' requests a directory listing; any other value
        is sent as the argument of the TYPE command.  When RETR is
        refused with a 550 reply, or *file* is empty, a LIST is issued
        instead.  Other permanent FTP errors are raised as URLError
        chained to the original ftplib exception.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'):
            cmd = 'TYPE A'
            isdir = 1
        else:
            cmd = 'TYPE ' + type
            isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The command failed; reconnect once and retry.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # A 550 reply falls through to the directory listing below.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error: %r' % reason) from reason
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error: %r' % reason) from reason
                finally:
                    # Always return to the directory we started from.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        # file_close() runs when the caller closes the returned object.
        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        """Mark the connection as no longer busy."""
        self.busy = 0

    def close(self):
        """Stop keeping the connection alive; close it now if unused."""
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        """Close hook for a file object returned by retrfile()."""
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        """Close the underlying FTP connection, ignoring FTP errors."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2462
2463# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    Variables whose name ends in lowercase '_proxy' take precedence
    over other spellings, and an empty one removes the scheme from the
    result.  When REQUEST_METHOD is set (CGI), a proxy for 'http' is
    only taken from such a lowercase variable.

    """
    proxies = {}
    # in order to prefer lowercase variables, process environment in
    # two passes: first matches any, second pass matches lowercase only
    for name, value in os.environ.items():
        lowered = name.lower()
        if value and lowered[-6:] == '_proxy':
            proxies[lowered[:-6]] = value
    # CVE-2016-1000110: when running as a CGI script, HTTP_PROXY may have
    # been set by the web server from a client-supplied "Proxy:" header,
    # so forget whatever the first pass found.  A lowercase http_proxy is
    # still honoured by the second pass below.
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    for name, value in os.environ.items():
        if name[-6:] == '_proxy':
            scheme = name.lower()[:-6]
            if value:
                proxies[scheme] = value
            else:
                # An empty lowercase variable explicitly disables the proxy.
                proxies.pop(scheme, None)
    return proxies
2488
def proxy_bypass_environment(host, proxies=None):
    """Test if proxies should not be used for a particular host.

    Checks the proxy dict for the value of no_proxy, which should
    be a list of comma separated DNS suffixes, or '*' for all hosts.

    *host* may carry a ':port' suffix; each entry is compared, ignoring
    case, against the host both with and without the port.  Leading
    dots on an entry are ignored, so '.example.com' and 'example.com'
    both cover 'example.com' and its subdomains.  Returns 1 to bypass
    the proxy and 0 otherwise.

    """
    if proxies is None:
        proxies = getproxies_environment()
    # don't bypass, if no_proxy isn't specified
    try:
        no_proxy = proxies['no']
    except KeyError:
        return 0
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    host = host.lower()
    # strip port off host
    hostonly = splitport(host)[0]
    # check if the host ends with any of the DNS suffixes
    for name in no_proxy.split(','):
        # ignore surrounding blanks and leading dots
        name = name.strip().lstrip('.').lower()
        if not name:
            continue
        if name in (hostonly, host):
            return 1
        # Require a label boundary so 'example.com' does not match
        # 'notexample.com'.
        suffix = '.' + name
        if hostonly.endswith(suffix) or host.endswith(suffix):
            return 1
    # otherwise, don't bypass
    return 0
2519
2520
Ronald Oussorene72e1612011-03-14 18:15:25 -04002521# This code tests an OSX specific data structure but is testable on all
2522# platforms
2523def _proxy_bypass_macosx_sysconf(host, proxy_settings):
2524 """
2525 Return True iff this host shouldn't be accessed using a proxy
2526
2527 This function uses the MacOSX framework SystemConfiguration
2528 to fetch the proxy information.
2529
2530 proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
2531 { 'exclude_simple': bool,
2532 'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
2533 }
2534 """
Ronald Oussorene72e1612011-03-14 18:15:25 -04002535 from fnmatch import fnmatch
2536
2537 hostonly, port = splitport(host)
2538
2539 def ip2num(ipAddr):
2540 parts = ipAddr.split('.')
2541 parts = list(map(int, parts))
2542 if len(parts) != 4:
2543 parts = (parts + [0, 0, 0, 0])[:4]
2544 return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
2545
2546 # Check for simple host names:
2547 if '.' not in host:
2548 if proxy_settings['exclude_simple']:
2549 return True
2550
2551 hostIP = None
2552
2553 for value in proxy_settings.get('exceptions', ()):
2554 # Items in the list are strings like these: *.local, 169.254/16
2555 if not value: continue
2556
2557 m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
2558 if m is not None:
2559 if hostIP is None:
2560 try:
2561 hostIP = socket.gethostbyname(hostonly)
2562 hostIP = ip2num(hostIP)
Andrew Svetlov0832af62012-12-18 23:10:48 +02002563 except OSError:
Ronald Oussorene72e1612011-03-14 18:15:25 -04002564 continue
2565
2566 base = ip2num(m.group(1))
2567 mask = m.group(2)
2568 if mask is None:
2569 mask = 8 * (m.group(1).count('.') + 1)
2570 else:
2571 mask = int(mask[1:])
2572 mask = 32 - mask
2573
2574 if (hostIP >> mask) == (base >> mask):
2575 return True
2576
2577 elif fnmatch(host, value):
2578 return True
2579
2580 return False
2581
2582
# Select the platform-specific implementations of getproxies() and
# proxy_bypass().  Environment variables win on every platform; the
# system configuration (macOS) or the registry (Windows) is only
# consulted when the environment defines no proxies.
if sys.platform == 'darwin':
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        """Return True if the system proxy settings exclude *host*."""
        proxy_settings = _get_proxy_settings()
        return _proxy_bypass_macosx_sysconf(host, proxy_settings)

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()

    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or from the MacOSX framework SystemConfiguration.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        """Return proxies from the environment, if specified, or from
        the MacOSX framework SystemConfiguration."""
        return getproxies_environment() or getproxies_macosx_sysconf()


elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            # The with block closes the registry key on every path,
            # including when a query or the parsing below raises.
            with winreg.OpenKey(
                    winreg.HKEY_CURRENT_USER,
                    r'Software\Microsoft\Windows\CurrentVersion\Internet Settings'
            ) as internetSettings:
                proxyEnable = winreg.QueryValueEx(internetSettings,
                                                  'ProxyEnable')[0]
                if proxyEnable:
                    # Returned as Unicode but problems if not converted to ASCII
                    proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                          'ProxyServer')[0])
                    if '=' in proxyServer:
                        # Per-protocol settings
                        for p in proxyServer.split(';'):
                            protocol, address = p.split('=', 1)
                            # See if address has a type:// prefix
                            if not re.match('^([^/:]+)://', address):
                                address = '%s://%s' % (protocol, address)
                            proxies[protocol] = address
                    else:
                        # Use one setting for all protocols
                        if proxyServer[:5] == 'http:':
                            proxies['http'] = proxyServer
                        else:
                            proxies['http'] = 'http://%s' % proxyServer
                            proxies['https'] = 'https://%s' % proxyServer
                            proxies['ftp'] = 'ftp://%s' % proxyServer
        except (OSError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        """Return 1 if the registry's ProxyOverride list covers *host*,
        else 0.

        The host name, its resolved address and its fully qualified
        name are each tested against every ';'-separated entry.
        """
        try:
            import winreg
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            # The with block closes the registry key once both values
            # have been read, or as soon as a query fails.
            with winreg.OpenKey(
                    winreg.HKEY_CURRENT_USER,
                    r'Software\Microsoft\Windows\CurrentVersion\Internet Settings'
            ) as internetSettings:
                proxyEnable = winreg.QueryValueEx(internetSettings,
                                                  'ProxyEnable')[0]
                # Returned as Unicode but problems if not converted to ASCII
                proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                        'ProxyOverride')[0])
        except OSError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except OSError:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except OSError:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                # '<local>' covers every host name without a dot.
                if '.' not in rawHost:
                    return 1
            # NOTE(review): the translated pattern is not anchored at the
            # end, so re.match() accepts any candidate that merely starts
            # with a match; other regex metacharacters are not escaped.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or the registry.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment