blob: 3a472f235f63a1de93342e9563f7632f098ce1cc [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""An extensible library for opening URLs using a variety of protocols
2
The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
urlopen(url, data=None) -- Basic usage is the same as original
urllib.  Pass the url and optionally data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of a URL.  Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.
23
build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of a default
handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
Senthil Kumaran1107c5d2009-11-15 06:20:55 +000033
Senthil Kumaran47fff872009-12-20 07:10:31 +000034OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000036
37Request -- An object that encapsulates the state of a request. The
38state can be as simple as the URL. It can also include extra HTTP
39headers, e.g. a User-Agent.
40
41BaseHandler --
42
43internals:
44BaseHandler and parent
45_call_chain conventions
46
47Example usage:
48
Georg Brandl029986a2008-06-23 11:44:14 +000049import urllib.request
Jeremy Hylton1afc1692008-06-18 20:49:58 +000050
51# set up authentication info
Georg Brandl029986a2008-06-23 11:44:14 +000052authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000053authinfo.add_password(realm='PDQ Application',
54 uri='https://mahler:8092/site-updates.py',
55 user='klem',
56 passwd='geheim$parole')
57
Georg Brandl029986a2008-06-23 11:44:14 +000058proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton1afc1692008-06-18 20:49:58 +000059
60# build a new opener that adds authentication and caching FTP handlers
Georg Brandl029986a2008-06-23 11:44:14 +000061opener = urllib.request.build_opener(proxy_support, authinfo,
62 urllib.request.CacheFTPHandler)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000063
64# install it
Georg Brandl029986a2008-06-23 11:44:14 +000065urllib.request.install_opener(opener)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000066
Georg Brandl029986a2008-06-23 11:44:14 +000067f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton1afc1692008-06-18 20:49:58 +000068"""
69
70# XXX issues:
# If an authentication error handler tries to perform authentication
# for some reason but fails, how should the error be signalled?  The
# client needs to know the HTTP error code.  But if the handler knows
# that the problem was, e.g., that it didn't know the hash algo that
# was requested in the challenge, it would be good to pass that
# information along to the client, too.
77# ftp errors aren't handled cleanly
78# check digest against correct (i.e. non-apache) implementation
79
80# Possible extensions:
81# complex proxies XXX not sure what exactly was meant by this
82# abstract factory for opener
83
84import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000085import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000086import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
Jeremy Hylton1afc1692008-06-18 20:49:58 +000092import re
93import socket
94import sys
95import time
Senthil Kumaran7bc0d872010-12-19 10:49:52 +000096import collections
Jeremy Hylton1afc1692008-06-18 20:49:58 +000097
Georg Brandl13e89462008-07-01 19:56:00 +000098from urllib.error import URLError, HTTPError, ContentTooShortError
99from urllib.parse import (
100 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
101 splittype, splithost, splitport, splituser, splitpasswd,
Senthil Kumarand95cc752010-08-08 11:27:53 +0000102 splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000103from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000104
105# check for SSL
106try:
107 import ssl
Senthil Kumaranc2958622010-11-22 04:48:26 +0000108except ImportError:
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000109 _have_ssl = False
110else:
111 _have_ssl = True
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000112
# Names exported by a star-import of this module, grouped by kind.
__all__ = [
    # Classes
    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
    'AbstractBasicAuthHandler', 'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler',
    'AbstractDigestAuthHandler', 'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler',
    'HTTPHandler', 'FileHandler', 'FTPHandler', 'CacheFTPHandler',
    'UnknownHandler', 'HTTPErrorProcessor',
    # Functions
    'urlopen', 'install_opener', 'build_opener',
    'pathname2url', 'url2pathname', 'getproxies',
    # Legacy interface
    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
]
128
# Used in the User-Agent header sent with each request.  Built from
# sys.version_info rather than by slicing sys.version: a three-character
# slice truncates two-digit minor versions ("3.10.4" would give "3.1").
__version__ = '%d.%d' % sys.version_info[:2]
131
# Module-wide default opener; created lazily by urlopen() or set explicitly
# through install_opener().
_opener = None


def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None):
    """Open *url* and return whatever the selected opener's open() returns.

    url, data and timeout are forwarded unchanged to ``opener.open``.

    When *cafile* or *capath* is given, a one-off opener is built around
    an HTTPSHandler whose SSL context requires a certificate verified
    against those locations and checks the host name; the module-wide
    opener is neither used nor modified in that case.  Otherwise the
    module-wide opener is used, being created with build_opener() on
    first use.

    Raises ValueError if *cafile*/*capath* is given but the ssl module
    could not be imported.
    """
    global _opener
    if cafile or capath:
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.options |= ssl.OP_NO_SSLv2
        # This branch is only entered when CA locations were supplied, so
        # verification and host name checking are always enabled here.
        context.verify_mode = ssl.CERT_REQUIRED
        context.load_verify_locations(cafile, capath)
        https_handler = HTTPSHandler(context=context, check_hostname=True)
        opener = build_opener(https_handler)
    elif _opener is None:
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000154
def install_opener(opener):
    """Store *opener* in the module-level ``_opener`` used by urlopen()."""
    global _opener
    _opener = opener
158
# TODO(jhylton): Make this work with the same global opener.
_urlopener = None


def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Forward to ``retrieve()`` on the shared legacy opener.

    The shared opener lives in the module-level ``_urlopener`` and is
    replaced by a new FancyURLopener whenever the stored value is falsy.
    """
    global _urlopener
    # Keep a usable cached opener; otherwise build one on demand.
    _urlopener = _urlopener or FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
166
def urlcleanup():
    """Clean up the legacy opener and drop the cached default opener.

    Calls ``cleanup()`` on ``_urlopener`` when it is set, then resets a
    truthy ``_opener`` to None.
    """
    global _opener
    legacy = _urlopener
    if legacy:
        legacy.cleanup()
    if _opener:
        _opener = None
173
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)


def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the network-location part of
    ``request.full_url``; when that is empty the request's "Host" header
    is used instead.  A trailing ":port" is stripped.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urlparse(request.full_url).netloc
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present, then normalise case
    return _cut_port_re.sub("", netloc, count=1).lower()
191
class Request:
    """State of a single URL request: URL parts, payload and headers.

    Header names are normalised with ``str.capitalize()`` when they are
    added; lookups via has_header()/get_header() use the name exactly as
    given.
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url, self.fragment = splittag(unwrap(url))
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        for name, value in headers.items():
            self.add_header(name, value)
        self.unredirected_hdrs = {}
        # Derive the origin host from this request when none was supplied.
        self.origin_req_host = (request_host(self)
                                if origin_req_host is None
                                else origin_req_host)
        self.unverifiable = unverifiable
        self.method = method
        self._parse()

    def _parse(self):
        """Split full_url into type, host and selector.

        Raises ValueError when the URL carries no scheme.
        """
        scheme, remainder = splittype(self.full_url)
        self.type = scheme
        if scheme is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        host, self.selector = splithost(remainder)
        self.host = unquote(host) if host else host

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        if self.method is not None:
            return self.method
        return "GET" if self.data is None else "POST"

    # Begin deprecated methods

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        """Return full_url, with '#fragment' re-attached when present."""
        if not self.fragment:
            return self.full_url
        return '%s#%s' % (self.full_url, self.fragment)

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Route the request through the proxy at *host*.

        The first proxy set on an https request only records the
        original host in _tunnel_host; in every other case the request
        type becomes *type* and the selector becomes the full URL.
        """
        if self.type != 'https' or self._tunnel_host:
            self.type = type
            self.selector = self.full_url
        else:
            self._tunnel_host = self.host
        self.host = host

    def has_proxy(self):
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return any(header_name in table
                   for table in (self.headers, self.unredirected_hdrs))

    def get_header(self, header_name, default=None):
        # Regular headers win over unredirected ones.
        if header_name in self.headers:
            return self.headers[header_name]
        return self.unredirected_hdrs.get(header_name, default)

    def header_items(self):
        """Return (name, value) pairs; regular headers override unredirected."""
        return list({**self.unredirected_hdrs, **self.headers}.items())
296
class OpenerDirector:
    """Dispatch requests, responses and errors to registered handlers.

    Handlers are indexed by inspecting their method names:
    ``<protocol>_open``, ``<protocol>_request``, ``<protocol>_response``
    and ``<protocol>_error_<kind>``.
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler* under every dispatch table its methods match.

        Raises TypeError when *handler* has no ``add_parent`` attribute.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        # Tables keyed directly by protocol, selected by method suffix.
        by_suffix = {
            "open": self.handle_open,
            "response": self.process_response,
            "request": self.process_request,
        }

        registered = False
        for name in dir(handler):
            if name in ("redirect_request", "do_open", "proxy_open"):
                # oops, coincidental match
                continue

            sep = name.find("_")
            protocol, condition = name[:sep], name[sep + 1:]

            if condition.startswith("error"):
                # Everything after "<protocol>_error_" names the error
                # kind; numeric kinds are stored as ints.
                kind_start = condition.find("_") + sep + 1
                kind = name[kind_start + 1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                table = self.handle_error.setdefault(protocol, {})
            elif condition in by_suffix:
                kind = protocol
                table = by_suffix[condition]
            else:
                continue

            # insort on an empty list is a plain append, so one call
            # covers both the first and the later registrations.
            bisect.insort(table.setdefault(kind, []), handler)
            registered = True

        if registered:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        for handler in chain.get(kind, ()):
            outcome = getattr(handler, meth_name)(*args)
            if outcome is not None:
                return outcome

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or request object) and return the response.

        The request is passed through the ``<protocol>_request``
        processors, opened, and the result passed through the
        ``<protocol>_response`` processors.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        request_hook = protocol + "_request"
        for processor in self.process_request.get(protocol, []):
            req = getattr(processor, request_hook)(req)

        response = self._open(req, data)

        # post-process response
        response_hook = protocol + "_response"
        for processor in self.process_response.get(protocol, []):
            response = getattr(processor, response_hook)(req, response)

        return response

    def _open(self, req, data=None):
        """Try the default, protocol-specific and unknown open chains in turn."""
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if not result:
            protocol = req.type
            result = self._call_chain(self.handle_open, protocol,
                                      protocol + '_open', req)
        if not result:
            result = self._call_chain(self.handle_open, 'unknown',
                                      'unknown_open', req)
        return result

    def error(self, proto, *args):
        """Dispatch an error to the matching handler chain.

        For http/https the third positional argument is used as the
        error kind, and ``http_error_default`` is tried when no specific
        handler produced a truthy result.
        """
        is_http = proto in ('http', 'https')
        if is_http:
            # XXX http[s] protocols are special-cased
            table = self.handle_error['http']  # https is not different than http
            original_args = args
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
        else:
            table = self.handle_error
            meth_name = proto + '_error'

        result = self._call_chain(table, proto, meth_name, *args)
        if result:
            return result

        if is_http:
            return self._call_chain(table, 'default', 'http_error_default',
                                    *original_args)
433 return self._call_chain(*args)
434
435# XXX probably also want an abstract factory that knows when it makes
436# sense to skip a superclass in favor of a subclass and when it might
437# make sense to include both
438
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    def overrides(candidate, default):
        # A class overrides by inheritance, an instance by type.
        if isclass(candidate):
            return issubclass(candidate, default)
        return isinstance(candidate, default)

    director = OpenerDirector()
    defaults = [ProxyHandler, UnknownHandler, HTTPHandler,
                HTTPDefaultErrorHandler, HTTPRedirectHandler,
                FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        defaults.append(HTTPSHandler)

    # Collect every default that is superseded by a supplied handler.
    superseded = set()
    for default in defaults:
        for candidate in handlers:
            if overrides(candidate, default):
                superseded.add(default)

    # Install the surviving defaults first, in their declared order.
    for default in defaults:
        if default not in superseded:
            director.add_handler(default())

    # Then the caller's handlers, instantiating any given as classes.
    for candidate in handlers:
        director.add_handler(candidate() if isclass(candidate) else candidate)
    return director
476
class BaseHandler:
    """Base class for handlers; orders instances by ``handler_order``."""

    handler_order = 500

    def add_parent(self, parent):
        """Remember the object this handler was registered with."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        if hasattr(other, "handler_order"):
            return self.handler_order < other.handler_order
        # Try to preserve the old behavior of having custom classes
        # inserted after default ones (works only for custom user
        # classes which are not aware of handler_order).
        return True
494
495
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return 2xx responses unchanged; route others to parent.error()."""
        status = response.code
        reason = response.msg
        headers = response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= status < 300:
            return response
        return self.parent.error(
            'http', request, response, status, reason, headers)

    https_response = http_response
512
class HTTPDefaultErrorHandler(BaseHandler):
    """Fallback HTTP error handler: always raise HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        # Hand the response object (fp) to the exception together with
        # the status code, message and headers.
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000516
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirects.

    The redirect target is read from the "location" (or "uri") response
    header.  It may be an absolute http, https or ftp URL, or a relative
    reference, which is resolved against the URL of the redirected
    request.
    """
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        # The new request carries no body, so drop the headers that
        # described the old one.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items()
                      if k.lower() not in CONTENT_HEADERS}
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow the redirect described by *headers*.

        Returns None when there is no target header or when
        redirect_request() declines.  Raises HTTPError for a disallowed
        target scheme or when a redirect loop is detected.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.  An empty scheme denotes a relative
        # reference, which inherits the scheme of the current request
        # when it is joined below, so it must be let through.
        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        # Only an absolute URL lacking a path gets "/" added; doing the
        # same to a path-less relative reference (e.g. "?page=2") would
        # wrongly turn it into a reference to the site root.
        if not urlparts.path and urlparts.netloc:
            urlparts = list(urlparts)
            urlparts[2] = "/"
            newurl = urlunparse(urlparts)

        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                    len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
619
620
def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, remainder = splittype(proxy)
    if remainder.startswith("//"):
        # URL.  We have an authority, so for RFC 3986-compliant URLs
        # (by ss 3. and 3.3.), path is empty or starts with '/'
        path_start = remainder.find("/", 2)
        if path_start == -1:
            authority = remainder[2:]
        else:
            authority = remainder[2:path_start]
    elif remainder.startswith("/"):
        # A single leading slash means a URL without an authority.
        raise ValueError("proxy URL with no authority: %r" % proxy)
    else:
        # Bare authority: whatever looked like a scheme is part of it.
        scheme = None
        authority = proxy

    userinfo, hostport = splituser(authority)
    if userinfo is None:
        return scheme, None, None, hostport
    user, password = splitpasswd(userinfo)
    return scheme, user, password, hostport
692
class ProxyHandler(BaseHandler):
    """Redirect requests through the proxies given per URL type.

    For every ``type -> proxy URL`` entry a ``<type>_open`` attribute is
    installed on the instance that calls proxy_open() for that entry.
    """
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, url in proxies.items():
            # Defaults bind the current entry; a plain closure would see
            # only the values from the last loop iteration.
            def open_via_proxy(r, proxy=url, type=scheme,
                               meth=self.proxy_open):
                return meth(r, proxy, type)
            setattr(self, '%s_open' % scheme, open_via_proxy)

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*, or return None to leave it untouched.

        Returns None when the host is bypassed, or after rewriting the
        request when other handlers can carry on with it; otherwise the
        rewritten request is re-opened through the parent.
        """
        original_scheme = req.type
        proxy_scheme, user, password, hostport = _parse_proxy(proxy)
        if proxy_scheme is None:
            proxy_scheme = original_scheme

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            credentials = '%s:%s' % (unquote(user),
                                     unquote(password))
            token = base64.b64encode(credentials.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + token)
        req.set_proxy(unquote(hostport), proxy_scheme)

        if original_scheme != proxy_scheme and original_scheme != 'https':
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)
        # let other handlers take care of it
        return None
Senthil Kumaranfb8cc2f2009-07-19 02:44:19 +0000733 return self.parent.open(req, timeout=req.timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000734
class HTTPPasswordMgr:
    """Map (realm, URI prefix) to (user, password) pairs.

    Credentials registered for a URI apply to that URI and to everything
    below it in the path hierarchy on the same authority.
    """

    def __init__(self):
        # realm -> {tuple of reduced URIs -> (user, password)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register *user*/*passwd* for *realm* at *uri*.

        *uri* may be a single URI string or a sequence of them.  Each
        entry is stored twice, once with and once without the scheme's
        default port filled in.
        """
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
                self.reduce_uri(u, default_port) for u in uri)
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) registered for *authuri* in *realm*.

        Returns (None, None) when nothing matches.
        """
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare whole path segments.  A plain character prefix would
        # also accept siblings such as "/foobar" for a base of "/foo" and
        # hand credentials to a location they were not registered for.
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
797
798
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the realm ``None``."""

    def find_user_password(self, realm, authuri):
        """Look up *realm* first, then the default realm (None)."""
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is None:
            # Nothing registered for this realm; try the catch-all entry.
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
807
808
class AbstractBasicAuthHandler:
    """Shared logic for answering HTTP Basic authentication challenges.

    Concrete handlers supply ``auth_header`` (the name of the request
    header that carries the credentials) and ``parent`` (the opener used
    to re-send the request).
    """

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    #
    # The leading group skips earlier comma-terminated challenges.  Each
    # repetition consumes exactly one comma-terminated segment, so a given
    # prefix can be matched in only one way.  The previous '(?:.*,)*' could
    # split a run of commas in exponentially many ways, which let a hostile
    # server stall the client with a crafted header (catastrophic
    # backtracking).  The set of matching prefixes, and therefore the
    # captured groups, are unchanged.
    rx = re.compile('(?:[^,\n]*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        """Use *password_mgr*, or a fresh HTTPPasswordMgr when it is None."""
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password
        self.retried = 0

    def reset_retry_count(self):
        """Forget how many times credentials have been re-sent."""
        self.retried = 0

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Answer the challenge found in the *authreq* header of *headers*.

        Returns the response obtained by retrying *req* with credentials,
        or None when there is no usable Basic challenge or no password is
        known.  Raises HTTPError once more than five attempts have been
        made, and ValueError when the challenge names another scheme.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if self.retried > 5:
            # retry sending the username:password 5 times before failing.
            raise HTTPError(req.get_full_url(), 401, "basic auth failed",
                            headers, None)
        else:
            self.retried += 1

        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() != 'basic':
                raise ValueError("AbstractBasicAuthHandler does not"
                                 " support the following scheme: '%s'" %
                                 scheme)
            else:
                mo = AbstractBasicAuthHandler.rx.search(authreq)
                if mo:
                    scheme, quote, realm = mo.groups()
                    if scheme.lower() == 'basic':
                        response = self.retry_http_basic_auth(host, req, realm)
                        if response and response.code != 401:
                            self.retried = 0
                        return response

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send *req* with the credentials registered for *realm*.

        Returns None when no password is known, or when the request
        already carries exactly these credentials (sending them again
        could not succeed).
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.headers.get(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None
873
874
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses by retrying with Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry *req* using the challenge in the www-authenticate header.

        The full request URL is what gets passed to the password lookup.
        The retry counter is cleared once the attempt has returned.
        """
        outcome = self.http_error_auth_reqed('www-authenticate',
                                             req.full_url, req, headers)
        self.reset_retry_count()
        return outcome
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000885
886
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses by retrying with Basic proxy credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Retry *req* using the challenge in the proxy-authenticate header.

        The retry counter is cleared once the attempt has returned.
        """
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        outcome = self.http_error_auth_reqed('proxy-authenticate',
                                             req.host, req, headers)
        self.reset_retry_count()
        return outcome
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000901
902
# Return n random bytes.  This is an alias of os.urandom; it is called by
# AbstractDigestAuthHandler.get_cnonce to add randomness to the client nonce.
_randombytes = os.urandom
905
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000906
class AbstractDigestAuthHandler:
    """Shared logic for answering HTTP Digest authentication challenges.

    Concrete handlers supply ``auth_header`` (the name of the request
    header that carries the credentials) and ``parent`` (the opener used
    to re-send the request).
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        """Use *passwd*, or a fresh HTTPPasswordMgr when it is None."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        """Forget how many times credentials have been re-sent."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Answer the challenge found in the *auth_header* of *headers*.

        Returns the retried response for a Digest challenge and None for
        a missing header or a Basic challenge.  Raises HTTPError after
        more than five attempts and ValueError for any other scheme.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        """Re-send *req* with a Digest response to the challenge *auth*.

        Returns None when no authorization can be computed or when the
        request already carries the very same header value.
        """
        # Split on any whitespace, as http_error_auth_reqed does, and
        # tolerate a challenge that consists of the scheme name only.
        pieces = auth.split(None, 1)
        challenge = pieces[1] if len(pieces) > 1 else ''
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        """Return a 16-character hexadecimal client nonce."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the credentials string answering the parsed challenge *chal*.

        Returns None when the challenge lacks a realm or nonce, names an
        unsupported algorithm, or no user is known for the realm.  Raises
        URLError when the server offers no supported quality of protection.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
        except KeyError:
            return None
        qop = chal.get('qop')
        algorithm = chal.get('algorithm', 'MD5')
        # mod_digest doesn't send an opaque, even though it isn't
        # supposed to be optional
        opaque = chal.get('opaque', None)

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in [option.strip() for option in qop.split(',')]:
            # The challenge carries a comma-separated list of options
            # (e.g. "auth,auth-int"); 'auth' is the one implemented here.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth',
                                           H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the ``(H, KD)`` functions for *algorithm*.

        ``(None, None)`` is returned for an algorithm that is not
        implemented, which get_authorization treats as "cannot answer".
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # XXX MD5-sess
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
1046
1047
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 401 responses by retrying with Digest credentials.

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    handler_order = 490  # before Basic auth
    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry *req* using the challenge in the www-authenticate header.

        The retry counter is cleared once the attempt has returned.
        """
        netloc = urlparse(req.full_url)[1]
        outcome = self.http_error_auth_reqed('www-authenticate',
                                             netloc, req, headers)
        self.reset_retry_count()
        return outcome
1064
1065
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 responses by retrying with Digest proxy credentials."""

    handler_order = 490  # before Basic auth
    auth_header = 'Proxy-Authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Retry *req* using the challenge in the proxy-authenticate header.

        The retry counter is cleared once the attempt has returned.
        """
        outcome = self.http_error_auth_reqed('proxy-authenticate',
                                             req.host, req, headers)
        self.reset_retry_count()
        return outcome
1077
class AbstractHTTPHandler(BaseHandler):
    """Request preparation and connection handling shared by HTTP handlers."""

    def __init__(self, debuglevel=0):
        """Remember *debuglevel*; do_open applies it to each connection."""
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Change the debug level applied to subsequent connections."""
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in the headers *request* needs before it is sent.

        Adds Content-type and Content-length for requests that carry
        data, a Host header, and the opener's default headers -- each
        only when the request does not define that header already.
        Returns *request*.

        Raises URLError when the request has no host, TypeError when the
        data is a str, and ValueError when the data is an iterable whose
        length cannot be determined and no Content-length was supplied.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                raise TypeError("POST data should be bytes"
                        " or an iterable of bytes. It cannot be str.")
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                try:
                    mv = memoryview(data)
                except TypeError:
                    # Not a buffer: the size of the body is unknown.
                    if isinstance(data, collections.Iterable):
                        raise ValueError("Content-Length should be specified "
                                "for iterable data of type %r %r" % (type(data),
                                data))
                else:
                    request.add_unredirected_header(
                            'Content-length', '%d' % (len(mv) * mv.itemsize))

        sel_host = host
        if request.has_proxy():
            # Through a proxy the selector is a full URL; the Host header
            # must name the host taken from that URL, not the proxy.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        Extra keyword arguments are passed to the http_class constructor.

        Raises URLError when the request has no host or when sending it
        fails with a socket error.  The connection is closed whenever
        sending the request or reading the response raises.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        # Apply the level given to __init__ / set_http_debuglevel, which
        # would otherwise be stored but never used.
        h.set_debuglevel(self._debuglevel)

        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers)
            except socket.error as err:  # timeout error
                raise URLError(err)
            r = h.getresponse()
        except BaseException:
            # Do not leave the socket open when no response is returned.
            h.close()
            raise

        r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with the reason phrase, because urllib clients expect the
        # response to have the reason in .msg.  It would be good to mark
        # this attribute as deprecated and get them to use info() or
        # .headers for the header data.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001179
1180
class HTTPHandler(AbstractHTTPHandler):
    """Open http: URLs through http.client.HTTPConnection."""

    # Request pre-processing is inherited unchanged from the base class.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Send *req* over a plain HTTP connection and return the response."""
        connection_class = http.client.HTTPConnection
        return self.do_open(connection_class, req)
1187
if hasattr(http.client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Open https: URLs through http.client.HTTPSConnection.

        The class is only defined (and exported) when http.client offers
        HTTPSConnection.
        """

        # Request pre-processing is inherited unchanged from the base class.
        https_request = AbstractHTTPHandler.do_request_

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            """Store the settings handed to every HTTPS connection."""
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._check_hostname = check_hostname
            self._context = context

        def https_open(self, req):
            """Send *req* over an HTTPS connection and return the response."""
            connection_options = dict(context=self._context,
                                      check_hostname=self._check_hostname)
            return self.do_open(http.client.HTTPSConnection, req,
                                **connection_options)

    __all__.append('HTTPSHandler')
Senthil Kumaran0d54eb92011-11-01 23:49:46 +08001204
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to outgoing requests and collect them from responses."""

    def __init__(self, cookiejar=None):
        """Use *cookiejar*, or a new http.cookiejar.CookieJar when None."""
        import http.cookiejar
        if cookiejar is not None:
            self.cookiejar = cookiejar
        else:
            self.cookiejar = http.cookiejar.CookieJar()

    def http_request(self, request):
        """Let the jar add its Cookie header to *request*; return it."""
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Let the jar extract cookies from *response*; return it."""
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    # HTTPS traffic is processed exactly like HTTP traffic.
    https_request = http_request
    https_response = http_response
1222
class UnknownHandler(BaseHandler):
    """Reject URLs whose scheme nothing else handled."""

    def unknown_open(self, req):
        """Always raise URLError naming the scheme of *req*."""
        scheme = req.type
        raise URLError('unknown url type: %s' % scheme)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001227
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dict mapping each key to its value.  A value enclosed in
    double quotes has the quotes removed; an empty value ("key=") is kept
    as the empty string.

    Raises ValueError for an element that contains no '='.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices rather than v[0] / v[-1]: indexing an empty value would
        # raise IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1237
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    Splits *s* at commas, except for commas that sit inside a
    double-quoted string.  Inside such a string a backslash is dropped
    and the character after it is taken literally.  Single quotes have
    no special meaning.  Every element is returned stripped of
    surrounding whitespace; an empty trailing element is omitted.
    """
    elements = []
    current = []
    in_quotes = False
    escaped = False

    for ch in s:
        if escaped:
            # Character following a backslash inside quotes: keep verbatim.
            current.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                current.append(ch)
        elif ch == ',':
            elements.append(''.join(current))
            current = []
        else:
            if ch == '"':
                in_quotes = True
            current.append(ch)

    # append last part
    tail = ''.join(current)
    if tail:
        elements.append(tail)

    return [element.strip() for element in elements]
1280
class FileHandler(BaseHandler):
    """Open file: URLs that refer to the local machine."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open the local file named by *req*.

        Raises URLError when the selector still carries an authority
        ('//host...') that is neither 'localhost' nor one of the values
        returned by get_names().  When such an authority is local, None
        is returned (the request is not opened here).
        """
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            # Membership test: the former identity comparison between the
            # host string and the tuple of names could never be true, so
            # every such URL was rejected.
            if req.host not in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the addresses of this machine, resolving them only once.

        The result is cached on the class and shared by all instances.
        """
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl object for the local file named by *req*.

        Content-type, Content-length and Last-modified headers are
        synthesised from the file's name and stat data.  Raises URLError
        when the file cannot be accessed or when the URL names a host
        that does not resolve to this machine.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001332
def _safe_gethostbyname(host):
    """Return the address socket.gethostbyname gives for *host*.

    None is returned instead of raising when the lookup fails with
    socket.gaierror.
    """
    try:
        address = socket.gethostbyname(host)
    except socket.gaierror:
        address = None
    return address
1338
class FTPHandler(BaseHandler):
    """Open ftp: URLs."""

    def ftp_open(self, req):
        """Retrieve the file or directory listing named by *req*.

        Returns an addinfourl object whose headers carry the guessed
        content type and, when known, the content length.  Raises
        URLError when no host is given, the host cannot be resolved, or
        the FTP exchange fails.
        """
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        port = ftplib.FTP_PORT if port is None else int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)

        path, attrs = splitattr(req.selector)
        segments = [unquote(segment) for segment in path.split('/')]
        # The last segment is the file; everything before it is directories.
        dirs, filename = segments[:-1], segments[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Default transfer type: image for a file, directory listing
            # when no file name is given.
            transfer_type = 'I' if filename else 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    transfer_type = value.upper()
            fp, retrlen = fw.retrfile(filename, transfer_type)
            header_text = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                header_text += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                header_text += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(header_text)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a new, non-persistent ftpwrapper for the given target."""
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001396
class CacheFTPHandler(FTPHandler):
    """FTP handler that keeps connections open for reuse."""

    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}      # connection key -> ftpwrapper
        self.timeout = {}    # connection key -> expiry time (time.time() based)
        self.soonest = 0     # earliest expiry among cached connections
        self.delay = 60      # lifetime granted on every use
        self.max_conns = 16  # cache size that triggers an eviction

    def setTimeout(self, t):
        """Set the lifetime granted to a connection each time it is used."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which a connection is evicted."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return the cached connection for this target, creating it if needed.

        Each use pushes the connection's expiry time to now + delay.
        """
        key = user, host, port, '/'.join(dirs), timeout
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Close expired connections, then evict one if the cache is full."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        self._update_soonest()

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # Close before dropping, as is done for expired
                    # entries above; otherwise the connection is leaked.
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self._update_soonest()

    def _update_soonest(self):
        # Recompute the earliest expiry; an empty cache resets it to 0
        # (min() of an empty sequence would raise ValueError).
        if self.timeout:
            self.soonest = min(self.timeout.values())
        else:
            self.soonest = 0

    def clear_cache(self):
        """Close every cached connection and empty the cache."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
1449
1450
# Code moved from the old urllib module

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
1454
# Helper for non-unix systems
if os.name != 'nt':
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        # Undo the percent-encoding; nothing else needs translating here.
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        # Percent-encode the path; nothing else needs translating here.
        return quote(pathname)
else:
    # Windows paths need the dedicated conversions from nturl2path.
    from nturl2path import url2pathname, pathname2url
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001468
# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Module-level dict that URLopener.__init__ stores as self.ftpcache, so
# openers share it unless a different dict is assigned to the instance.
ftpcache = {}
1477class URLopener:
1478 """Class to open URLs.
1479 This is a class rather than just a subroutine because we may need
1480 more than one set of global protocol-specific options.
1481 Note -- this is a base class for those who don't want the
1482 automatic handling of errors type 302 (relocated) and 401
1483 (authorization needed)."""
1484
1485 __tempfiles = None
1486
1487 version = "Python-urllib/%s" % __version__
1488
1489 # Constructor
def __init__(self, proxies=None, **x509):
    """Set up proxies, client certificate files and default headers.

    *proxies* must be a mapping; when it is None the result of
    getproxies() is used.  The optional 'key_file' and 'cert_file'
    keyword arguments are remembered on the instance.
    """
    if proxies is None:
        proxies = getproxies()
    assert hasattr(proxies, 'keys'), "proxies must be a mapping"
    self.proxies = proxies

    self.cert_file = x509.get('cert_file')
    self.key_file = x509.get('key_file')
    self.addheaders = [('User-Agent', self.version)]

    # Temporary files to remove in cleanup(), plus a private reference
    # to os.unlink that stays usable there.
    self.__tempfiles = []
    self.__unlink = os.unlink # See cleanup()

    # Undocumented feature: if you assign {} to tempcache,
    # it is used to cache files retrieved with
    # self.retrieve().  This is not enabled by default
    # since it does not work for changing documents (and I
    # haven't got the logic to check expiration headers
    # yet).
    self.tempcache = None

    # Undocumented feature: you can use a different
    # ftp cache by assigning to the .ftpcache member;
    # in case you want logically independent URL openers
    # XXX This is not threadsafe.  Bah.
    self.ftpcache = ftpcache
1512
def __del__(self):
    """Run close() when the opener is finalized."""
    self.close()
1515
def close(self):
    """Release what the opener holds; this simply runs cleanup()."""
    self.cleanup()
1518
def cleanup(self):
    """Delete the recorded temporary files and empty the temp cache.

    Files that cannot be removed (OSError) are silently skipped.
    """
    # This code sometimes runs when the rest of this module
    # has already been deleted, so it can't use any globals
    # or import anything.
    leftovers = self.__tempfiles
    if leftovers:
        for path in leftovers:
            try:
                self.__unlink(path)
            except OSError:
                pass
        del leftovers[:]
    cache = self.tempcache
    if cache:
        cache.clear()
1532
def addheader(self, *args):
    """Add a header to be used by the HTTP interface only
    e.g. u.addheader('Accept', 'sound/basic')"""
    # The arguments are stored together as one tuple.
    header = args
    self.addheaders.append(header)
1537
1538 # External interface
def open(self, fullurl, data=None):
    """Use URLopener().open(file) instead of open(file, 'r')."""
    fullurl = quote(unwrap(to_bytes(fullurl)),
                    safe="%/:=&?~#+!$,;'@()*[]|")

    # Serve the document from the temp cache when it is enabled and
    # already holds this URL.
    if self.tempcache and fullurl in self.tempcache:
        filename, headers = self.tempcache[fullurl]
        return addinfourl(open(filename, 'rb'), headers, fullurl)

    urltype, url = splittype(fullurl)
    urltype = urltype or 'file'

    proxy = None
    if urltype in self.proxies:
        proxy = self.proxies[urltype]
        urltype, proxyhost = splittype(proxy)
        host, selector = splithost(proxyhost)
        url = (host, fullurl) # Signal special case to open_*()

    # Dispatch to the open_<scheme>() method, if there is one.
    method_name = ('open_' + urltype).replace('-', '_')
    self.type = urltype
    if not hasattr(self, method_name):
        if proxy:
            return self.open_unknown_proxy(proxy, fullurl, data)
        return self.open_unknown(fullurl, data)

    arguments = (url,) if data is None else (url, data)
    try:
        return getattr(self, method_name)(*arguments)
    except HTTPError:
        raise
    except socket.error as msg:
        raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1574
def open_unknown(self, fullurl, data=None):
    """Overridable interface to open unknown URL type.

    This implementation always raises IOError carrying the scheme
    that was split off fullurl; data is not used.
    """
    scheme = splittype(fullurl)[0]
    raise IOError('url error', 'unknown url type', scheme)
1579
def open_unknown_proxy(self, proxy, fullurl, data=None):
    """Overridable interface for a proxy whose URL type has no open_* method.

    This implementation always raises IOError naming the scheme of
    fullurl and carrying the proxy URL; data is not used.
    """
    scheme = splittype(fullurl)[0]
    raise IOError('url error', 'invalid proxy for %s' % scheme, proxy)
1584
1585 # External interface
def retrieve(self, url, filename=None, reporthook=None, data=None):
    """retrieve(url) returns (filename, headers) for a local object
    or (tempfilename, headers) for a remote object.

    A cached result in self.tempcache is returned as is.  When no
    filename is given and the URL has no scheme or the 'file' scheme,
    the local file is not copied: its own path is returned.  Otherwise
    the URL is opened and copied in 8 KiB blocks into filename, or into
    a new temporary file (recorded for cleanup()) whose suffix is taken
    from the URL path.  reporthook, if given, is called as
    reporthook(blocknum, blocksize, totalsize) once before the first
    read and once after every block written; totalsize is -1 when the
    response has no Content-Length header.

    Raises ContentTooShortError when fewer bytes arrive than the
    Content-Length header announced.
    """
    url = unwrap(to_bytes(url))
    if self.tempcache and url in self.tempcache:
        return self.tempcache[url]

    scheme, remainder = splittype(url)
    if filename is None and (not scheme or scheme == 'file'):
        try:
            local_fp = self.open_local_file(remainder)
            local_headers = local_fp.info()
            local_fp.close()
            return url2pathname(splithost(remainder)[1]), local_headers
        except IOError:
            # Not reachable as a local file: fall back to a real download.
            pass

    fp = self.open(url, data)
    try:
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            import tempfile
            # Reduce the URL to its bare path to pick the file suffix.
            path = splittype(url)[1]
            path = splithost(path or "")[1]
            path = splitquery(path or "")[0]
            path = splitattr(path or "")[0]
            suffix = os.path.splitext(path)[1]
            fd, filename = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        try:
            result = filename, headers
            if self.tempcache is not None:
                self.tempcache[url] = result
            blocksize = 1024*8
            size = -1
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, blocksize, size)
            block = fp.read(blocksize)
            while block:
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, blocksize, size)
                block = fp.read(blocksize)
        finally:
            tfp.close()
    finally:
        fp.close()

    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result
1649
1650 # Each method named open_<type> knows how to open that type of URL
1651
1652 def _open_generic_http(self, connection_factory, url, data):
1653 """Make an HTTP connection using connection_class.
1654
1655 This is an internal method that should be called from
1656 open_http() or open_https().
1657
1658 Arguments:
1659 - connection_factory should take a host name and return an
1660 HTTPConnection instance.
1661 - url is the url to retrieval or a host, relative-path pair.
1662 - data is payload for a POST request or None.
1663 """
1664
1665 user_passwd = None
1666 proxy_passwd= None
1667 if isinstance(url, str):
Georg Brandl13e89462008-07-01 19:56:00 +00001668 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001669 if host:
Georg Brandl13e89462008-07-01 19:56:00 +00001670 user_passwd, host = splituser(host)
1671 host = unquote(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001672 realhost = host
1673 else:
1674 host, selector = url
1675 # check whether the proxy contains authorization information
Georg Brandl13e89462008-07-01 19:56:00 +00001676 proxy_passwd, host = splituser(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001677 # now we proceed with the url we want to obtain
Georg Brandl13e89462008-07-01 19:56:00 +00001678 urltype, rest = splittype(selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001679 url = rest
1680 user_passwd = None
1681 if urltype.lower() != 'http':
1682 realhost = None
1683 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001684 realhost, rest = splithost(rest)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001685 if realhost:
Georg Brandl13e89462008-07-01 19:56:00 +00001686 user_passwd, realhost = splituser(realhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001687 if user_passwd:
1688 selector = "%s://%s%s" % (urltype, realhost, rest)
1689 if proxy_bypass(realhost):
1690 host = realhost
1691
1692 #print "proxy via http:", host, selector
1693 if not host: raise IOError('http error', 'no host given')
1694
1695 if proxy_passwd:
Senthil Kumaran5626eec2010-08-04 17:46:23 +00001696 proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001697 else:
1698 proxy_auth = None
1699
1700 if user_passwd:
Senthil Kumaran5626eec2010-08-04 17:46:23 +00001701 auth = base64.b64encode(user_passwd.encode()).decode('ascii')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001702 else:
1703 auth = None
1704 http_conn = connection_factory(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001705 headers = {}
1706 if proxy_auth:
1707 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
1708 if auth:
1709 headers["Authorization"] = "Basic %s" % auth
1710 if realhost:
1711 headers["Host"] = realhost
Senthil Kumarand91ffca2011-03-19 17:25:27 +08001712
1713 # Add Connection:close as we don't support persistent connections yet.
1714 # This helps in closing the socket and avoiding ResourceWarning
1715
1716 headers["Connection"] = "close"
1717
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001718 for header, value in self.addheaders:
1719 headers[header] = value
1720
1721 if data is not None:
1722 headers["Content-Type"] = "application/x-www-form-urlencoded"
1723 http_conn.request("POST", selector, data, headers)
1724 else:
1725 http_conn.request("GET", selector, headers=headers)
1726
1727 try:
1728 response = http_conn.getresponse()
1729 except http.client.BadStatusLine:
1730 # something went wrong with the HTTP status line
Georg Brandl13e89462008-07-01 19:56:00 +00001731 raise URLError("http protocol error: bad status line")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001732
1733 # According to RFC 2616, "2xx" code indicates that the client's
1734 # request was successfully received, understood, and accepted.
1735 if 200 <= response.status < 300:
Antoine Pitroub353c122009-02-11 00:39:14 +00001736 return addinfourl(response, response.msg, "http:" + url,
Georg Brandl13e89462008-07-01 19:56:00 +00001737 response.status)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001738 else:
1739 return self.http_error(
1740 url, response.fp,
1741 response.status, response.reason, response.msg, data)
1742
def open_http(self, url, data=None):
    """Use HTTP protocol.

    Delegates to _open_generic_http() with http.client.HTTPConnection
    as the connection factory.
    """
    factory = http.client.HTTPConnection
    return self._open_generic_http(factory, url, data)
1746
def http_error(self, url, fp, errcode, errmsg, headers, data=None):
    """Handle http errors.

    Derived class can override this, or provide specific handlers
    named http_error_DDD where DDD is the 3-digit error code.

    A specific handler receives data only when data is not None; when
    it is missing or returns a false value, http_error_default() is
    called instead (always without data).
    """
    # First check if there's a specific handler for this error
    name = 'http_error_%d' % errcode
    if hasattr(self, name):
        handler = getattr(self, name)
        call_args = (url, fp, errcode, errmsg, headers)
        if data is not None:
            call_args += (data,)
        outcome = handler(*call_args)
        if outcome:
            return outcome
    return self.http_error_default(url, fp, errcode, errmsg, headers)
1762
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Default error handler: close the connection and raise IOError.

    The response body is read and discarded, fp is closed, and an
    HTTPError built from the remaining arguments is raised.
    """
    fp.read()  # drain the body; its content is not needed
    fp.close()
    raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001768
if _have_ssl:
    def _https_connection(self, host):
        """Return an HTTPSConnection to host using this opener's
        key_file and cert_file."""
        ssl_files = {'key_file': self.key_file,
                     'cert_file': self.cert_file}
        return http.client.HTTPSConnection(host, **ssl_files)

    def open_https(self, url, data=None):
        """Use HTTPS protocol.

        Delegates to _open_generic_http() with _https_connection as
        the connection factory.
        """
        factory = self._https_connection
        return self._open_generic_http(factory, url, data)
1778
def open_file(self, url):
    """Open a 'file' URL through open_local_file().

    Raises URLError when url is not a string (the proxy form is not
    supported) and ValueError when the URL starts with '//' followed by
    a host part other than 'localhost'.
    """
    if not isinstance(url, str):
        raise URLError('file error', 'proxy support for file protocol currently not implemented')
    names_other_host = (url[:2] == '//'
                        and url[2:3] != '/'
                        and url[2:12].lower() != 'localhost/')
    if names_other_host:
        raise ValueError("file:// scheme is supported only on localhost")
    return self.open_local_file(url)
1787
def open_local_file(self, url):
    """Use local file.

    Returns an addinfourl object wrapping the file opened in binary
    mode, with Content-Type (guessed from the URL, 'text/plain' when
    unknown), Content-Length and Last-modified headers built from
    os.stat().

    Raises URLError when the file cannot be stat'ed, or when the URL
    names a host with a port or a host that does not resolve to one of
    this machine's addresses.
    """
    import email.utils
    import mimetypes
    host, file = splithost(url)
    localname = url2pathname(file)
    try:
        stats = os.stat(localname)
    except OSError as e:
        # URLError takes (reason, filename); passing errno as well made
        # the constructor itself fail with TypeError.
        raise URLError(e.strerror, e.filename)
    size = stats.st_size
    modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
    mtype = mimetypes.guess_type(url)[0]
    headers = email.message_from_string(
        'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
        (mtype or 'text/plain', size, modified))

    is_local = not host
    if not is_local:
        host, port = splitport(host)
        # localhost() returns a single address string while thishost()
        # returns a tuple, so wrap the former before concatenating.
        is_local = (not port
                    and socket.gethostbyname(host) in
                    ((localhost(),) + thishost()))
    if not is_local:
        raise URLError('local file error', 'not on local host')

    urlfile = file
    if file[:1] == '/':
        urlfile = 'file://' + file
    return addinfourl(open(localname, 'rb'), headers, urlfile)
1817
def open_ftp(self, url):
    """Use FTP protocol.

    url is the part of an ftp URL after 'ftp:' ('//[user[:passwd]@]
    host[:port]/path[;type=x]').  Connections are kept in self.ftpcache,
    keyed by (user, host address, port, directory); once the cache holds
    more than MAXFTPCACHE entries every entry other than the one needed
    now is closed and dropped.

    Returns an addinfourl object whose headers carry Content-Type (when
    it can be guessed) and Content-Length (when the server reported it).
    Raises URLError when url is not a string, when no host is given, or
    when the FTP layer reports an error.
    """
    if not isinstance(url, str):
        raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
    import mimetypes
    host, path = splithost(url)
    if not host:
        raise URLError('ftp error', 'no host given')
    host, port = splitport(host)
    user, host = splituser(host)
    if user:
        user, passwd = splitpasswd(user)
    else:
        passwd = None
    host = unquote(host)
    user = unquote(user or '')
    passwd = unquote(passwd or '')
    host = socket.gethostbyname(host)
    if not port:
        import ftplib
        port = ftplib.FTP_PORT
    else:
        port = int(port)
    path, attrs = splitattr(path)
    path = unquote(path)
    dirs = path.split('/')
    dirs, file = dirs[:-1], dirs[-1]
    if dirs and not dirs[0]:
        dirs = dirs[1:]
    if dirs and not dirs[0]:
        dirs[0] = '/'
    key = user, host, port, '/'.join(dirs)
    # XXX thread unsafe!
    if len(self.ftpcache) > MAXFTPCACHE:
        # Prune the cache, rather arbitrarily.  Iterate over a snapshot
        # of the keys: deleting entries while iterating the live dict
        # raises RuntimeError.
        for k in list(self.ftpcache):
            if k != key:
                v = self.ftpcache[k]
                del self.ftpcache[k]
                v.close()
    try:
        if key not in self.ftpcache:
            self.ftpcache[key] = \
                ftpwrapper(user, passwd, host, port, dirs)
        # Default transfer type: directory listing when the path has no
        # file component, binary ("image") otherwise.
        if not file:
            type = 'D'
        else:
            type = 'I'
        for attr in attrs:
            attr, value = splitvalue(attr)
            if attr.lower() == 'type' and \
               value in ('a', 'A', 'i', 'I', 'd', 'D'):
                type = value.upper()
        (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
        mtype = mimetypes.guess_type("ftp:" + url)[0]
        headers = ""
        if mtype:
            headers += "Content-Type: %s\n" % mtype
        if retrlen is not None and retrlen >= 0:
            headers += "Content-Length: %d\n" % retrlen
        headers = email.message_from_string(headers)
        return addinfourl(fp, headers, "ftp:" + url)
    except ftperrors() as msg:
        raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1875
def open_data(self, url, data=None):
    """Use "data" URL.

    url is the part after 'data:'.  Any POSTed data is ignored.  The
    result is an addinfourl object over a text buffer that holds
    generated Date, Content-type and Content-Length header lines, an
    empty line, and then the decoded payload.

    Raises URLError when url is not a string and IOError when the URL
    has no comma separating the media type from the data.
    """
    if not isinstance(url, str):
        raise URLError('data error', 'proxy support for data protocol currently not implemented')
    # ignore POSTed data
    #
    # syntax of data URLs:
    # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    # mediatype := [ type "/" subtype ] *( ";" parameter )
    # data      := *urlchar
    # parameter := attribute "=" value
    try:
        mediatype, payload = url.split(',', 1)
    except ValueError:
        raise IOError('data error', 'bad data URL')
    if not mediatype:
        mediatype = 'text/plain;charset=US-ASCII'

    # A trailing ";token" without "=" names the transfer encoding.
    encoding = ''
    semi = mediatype.rfind(';')
    if semi >= 0 and '=' not in mediatype[semi:]:
        encoding = mediatype[semi+1:]
        mediatype = mediatype[:semi]

    now = time.gmtime(time.time())
    lines = [
        'Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT', now),
        'Content-type: %s' % mediatype,
    ]
    if encoding == 'base64':
        # XXX is this encoding/decoding ok?
        payload = base64.decodebytes(payload.encode('ascii')).decode('latin-1')
    else:
        payload = unquote(payload)
    lines += ['Content-Length: %d' % len(payload), '', payload]

    msg = '\n'.join(lines)
    headers = email.message_from_string(msg)
    return addinfourl(io.StringIO(msg), headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001916
1917
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps).

    Adds handling of redirects (301, 302, 303, 307) and of Basic
    authentication challenges (401, 407) on top of URLopener.
    """

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # (user, passwd) pairs keyed by 'realm@host'.
        self.auth_cache = {}
        # Redirect counter and its limit.
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        After maxtries redirects the response is reported as a 500
        error through http_error_500 when that method exists, through
        http_error_default otherwise.
        """
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                give_up = self.http_error_500
            else:
                give_up = self.http_error_default
            self.tries = 0
            return give_up(url, fp, 500,
                           "Internal Server Error: Redirect Recursion",
                           headers)
        outcome = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                         data)
        self.tries = 0
        return outcome

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Open the redirect target named by the response headers.

        Returns None when neither a 'location' nor a 'uri' header is
        present.  Raises HTTPError when the target's scheme is not
        http, https or ftp.  The target is opened without data.
        """
        for field in ('location', 'uri'):
            if field in headers:
                newurl = headers[field]
                break
        else:
            return
        fp.read()  # drain the body of the redirect response
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3
        if urlparse(newurl).scheme not in ('http', 'https', 'ftp'):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is not None:
            return self.http_error_default(url, fp, errcode, errmsg, headers)
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def _basic_auth_challenge(self, challenge_header, retry_prefix, url, fp,
                              errcode, errmsg, headers, data, retry):
        """Shared body of http_error_401() and http_error_407().

        URLopener.http_error_default() is invoked when the challenge
        header is missing, cannot be parsed, names a scheme other than
        Basic, or when retry is false.  Otherwise the method named
        <retry_prefix><self.type>_basic_auth is called with url, the
        realm, and data when data is not None.
        """
        def reject():
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)

        if challenge_header not in headers:
            reject()
        stuff = headers[challenge_header]
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            reject()
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            reject()
        if not retry:
            reject()
        name = retry_prefix + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        return getattr(self, name)(url, realm, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        return self._basic_auth_challenge(
            'www-authenticate', 'retry_', url, fp,
            errcode, errmsg, headers, data, retry)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        return self._basic_auth_challenge(
            'proxy-authenticate', 'retry_proxy_', url, fp,
            errcode, errmsg, headers, data, retry)

    def _retry_via_proxy(self, scheme, url, realm, data):
        """Shared body of the retry_proxy_<scheme>_basic_auth() methods.

        Obtains credentials for the proxy configured for scheme, stores
        them in that proxy's URL in self.proxies and reopens url.
        Returns None when no credentials are available.
        """
        host, selector = splithost(url)
        newurl = scheme + '://' + host + selector
        proxy = self.proxies[scheme]
        proxyhost = splittype(proxy)[1]
        proxyhost, proxyselector = splithost(proxyhost)
        # Non-zero when the proxy URL already carried credentials; they
        # are stripped and the cached entry is discarded.
        at = proxyhost.find('@') + 1
        proxyhost = proxyhost[at:]
        user, passwd = self.get_user_passwd(proxyhost, realm, at)
        if not (user or passwd):
            return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies[scheme] = scheme + '://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        return self.open(newurl, data)

    def _retry_direct(self, scheme, url, realm, data):
        """Shared body of the retry_<scheme>_basic_auth() methods.

        Obtains credentials for the host in url and reopens the URL
        with them embedded.  Returns None when no credentials are
        available.
        """
        host, selector = splithost(url)
        # Non-zero when the URL already carried credentials; they are
        # stripped and the cached entry is discarded.
        at = host.find('@') + 1
        host = host[at:]
        user, passwd = self.get_user_passwd(host, realm, at)
        if not (user or passwd):
            return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = scheme + '://' + host + selector
        if data is None:
            return self.open(newurl)
        return self.open(newurl, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Retry an http request with credentials added to the http proxy."""
        return self._retry_via_proxy('http', url, realm, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Retry an https request with credentials added to the https proxy."""
        return self._retry_via_proxy('https', url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Retry an http request with credentials embedded in the URL."""
        return self._retry_direct('http', url, realm, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Retry an https request with credentials embedded in the URL."""
        return self._retry_direct('https', url, realm, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for realm at host.

        A cached pair is returned unless clear_cache is true, in which
        case it is dropped first.  Otherwise prompt_user_passwd() is
        asked, and its answer is cached when either part is non-empty.
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if not clear_cache:
                return self.auth_cache[key]
            del self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd:
            self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!

        Asks for a user name and a password on the terminal and
        returns them; returns (None, None) on KeyboardInterrupt.
        """
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
        except KeyboardInterrupt:
            print()
            return None, None
        return user, passwd
2127
2128
2129# Utility functions
2130
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The address is looked up once and kept in the module-level
    _localhost for later calls.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2138
_thishost = None
def thishost():
    """Return the IP addresses of the current host.

    The addresses are returned as a tuple that is computed once and
    kept in the module-level _thishost for later calls.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    own_name = socket.gethostname()
    addresses = socket.gethostbyname_ex(own_name)[2]
    _thishost = tuple(addresses)
    return _thishost
2146
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported on the first call only; its all_errors value is
    kept in the module-level _ftperrors afterwards.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2155
_noheaders = None
def noheaders():
    """Return an empty email Message object.

    The same object, created on the first call, is returned every time.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2163
2164
2165# Utility classes
2166
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections.

    One instance wraps one logged-in ftplib.FTP connection positioned
    in a given directory.  refcount counts the file objects handed out
    by retrfile() that have not been closed yet; keepalive tells
    whether the connection should stay open once they all are.
    """

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        self.user, self.passwd = user, passwd
        self.host, self.port = host, port
        self.dirs = dirs
        self.timeout = timeout
        self.refcount = 0
        self.keepalive = persistent
        self.init()

    def init(self):
        """Connect, log in and change into each directory of self.dirs."""
        import ftplib
        self.busy = 0
        connection = ftplib.FTP()
        self.ftp = connection
        connection.connect(self.host, self.port, self.timeout)
        connection.login(self.user, self.passwd)
        for directory in self.dirs:
            connection.cwd(directory)

    def retrfile(self, file, type):
        """Start a transfer and return (file object, length).

        type is the FTP transfer type; 'd' or 'D' requests a directory
        listing.  A file is tried with RETR first; when that is refused
        with a 550 reply, or when no file name is given, a LIST is
        issued instead.  The length is whatever ntransfercmd() reports.
        Closing the returned file object calls file_close().

        Raises URLError for any other permission error, or when file
        cannot be entered as a directory for the listing.
        """
        import ftplib
        self.endtransfer()
        isdir = type in ('d', 'D')
        cmd = 'TYPE A' if isdir else 'TYPE ' + type
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection may have gone stale: reconnect once.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                conn, retrlen = self.ftp.ntransfercmd('RETR ' + file)
            except ftplib.error_perm as reason:
                if str(reason)[:3] != '550':
                    raise URLError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if not file:
                cmd = 'LIST'
            else:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error', reason) from reason
                finally:
                    # Always return to the directory we started from.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        """Finish a pending transfer, ignoring FTP errors; no-op when idle."""
        if self.busy:
            self.busy = 0
            try:
                self.ftp.voidresp()
            except ftperrors():
                pass

    def close(self):
        """Stop keeping the connection alive; close it now if no file
        object from retrfile() is still open."""
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        """Close hook of the file objects returned by retrfile()."""
        self.endtransfer()
        self.refcount -= 1
        if not self.keepalive and self.refcount <= 0:
            self.real_close()

    def real_close(self):
        """Close the FTP connection, ignoring FTP errors."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2262
2263# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy (compared
    case-insensitively) that have a non-empty value; this seems to be
    the standard convention.  If you need a different way, you can
    pass a proxies dictionary to the [Fancy]URLopener constructor.

    """
    suffix = '_proxy'
    return {
        name.lower()[:-len(suffix)]: value
        for name, value in os.environ.items()
        if value and name.lower().endswith(suffix)
    }
2279
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy (or NO_PROXY
    when the former is unset or empty), which should be a list of DNS
    suffixes separated by commas, or '*' for all hosts.

    Returns 1 when the host, with or without its port, ends with one
    of the suffixes, and 0 otherwise.
    """
    environ = os.environ
    no_proxy = environ.get('no_proxy', '') or environ.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly = splitport(host)[0]
    # check if the host ends with any of the DNS suffixes
    suffixes = [entry.strip() for entry in no_proxy.split(',')]
    matched = any(
        suffix and (hostonly.endswith(suffix) or host.endswith(suffix))
        for suffix in suffixes)
    # otherwise, don't bypass
    return 1 if matched else 0
2299
2300
Ronald Oussorene72e1612011-03-14 18:15:25 -04002301# This code tests an OSX specific data structure but is testable on all
2302# platforms
2303def _proxy_bypass_macosx_sysconf(host, proxy_settings):
2304 """
2305 Return True iff this host shouldn't be accessed using a proxy
2306
2307 This function uses the MacOSX framework SystemConfiguration
2308 to fetch the proxy information.
2309
2310 proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
2311 { 'exclude_simple': bool,
2312 'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
2313 }
2314 """
Ronald Oussorene72e1612011-03-14 18:15:25 -04002315 from fnmatch import fnmatch
2316
2317 hostonly, port = splitport(host)
2318
2319 def ip2num(ipAddr):
2320 parts = ipAddr.split('.')
2321 parts = list(map(int, parts))
2322 if len(parts) != 4:
2323 parts = (parts + [0, 0, 0, 0])[:4]
2324 return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
2325
2326 # Check for simple host names:
2327 if '.' not in host:
2328 if proxy_settings['exclude_simple']:
2329 return True
2330
2331 hostIP = None
2332
2333 for value in proxy_settings.get('exceptions', ()):
2334 # Items in the list are strings like these: *.local, 169.254/16
2335 if not value: continue
2336
2337 m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
2338 if m is not None:
2339 if hostIP is None:
2340 try:
2341 hostIP = socket.gethostbyname(hostonly)
2342 hostIP = ip2num(hostIP)
2343 except socket.error:
2344 continue
2345
2346 base = ip2num(m.group(1))
2347 mask = m.group(2)
2348 if mask is None:
2349 mask = 8 * (m.group(1).count('.') + 1)
2350 else:
2351 mask = int(mask[1:])
2352 mask = 32 - mask
2353
2354 if (hostIP >> mask) == (base >> mask):
2355 return True
2356
2357 elif fnmatch(host, value):
2358 return True
2359
2360 return False
2361
2362
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002363if sys.platform == 'darwin':
Ronald Oussoren84151202010-04-18 20:46:11 +00002364 from _scproxy import _get_proxy_settings, _get_proxies
2365
def proxy_bypass_macosx_sysconf(host):
    """Apply the settings returned by _get_proxy_settings() to host.

    The decision itself is delegated to _proxy_bypass_macosx_sysconf().
    """
    return _proxy_bypass_macosx_sysconf(host, _get_proxy_settings())
Ronald Oussoren84151202010-04-18 20:46:11 +00002369
def getproxies_macosx_sysconf():
    """Return a dictionary of scheme -> proxy server URL mappings.

    The mapping is obtained from _scproxy._get_proxies(), which uses the
    MacOSX framework SystemConfiguration to fetch the proxy information.
    """
    system_proxies = _get_proxies()
    return system_proxies
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002377
Ronald Oussoren84151202010-04-18 20:46:11 +00002378
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002379
def proxy_bypass(host):
    """Test if proxies should not be used for a particular host.

    When proxies are configured through the environment, the environment
    (no_proxy) decides; otherwise the MacOSX system configuration is
    consulted.
    """
    if not getproxies_environment():
        return proxy_bypass_macosx_sysconf(host)
    return proxy_bypass_environment(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002385
def getproxies():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Settings from the environment take precedence; the MacOSX system
    configuration is used only when the environment defines none.
    """
    env_proxies = getproxies_environment()
    if env_proxies:
        return env_proxies
    return getproxies_macosx_sysconf()
2388
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002389
2390elif os.name == 'nt':
def getproxies_registry():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Win32 uses the registry to store proxies.

    The ProxyEnable and ProxyServer values of the current user's
    'Internet Settings' key are read.  ProxyServer is either a single
    server used for all protocols, or a ';'-separated list of
    <protocol>=<address> entries.  An empty dictionary is returned when
    winreg is unavailable, the key or values are missing, or proxies are
    disabled; entries parsed before a malformed entry are kept.

    """
    proxies = {}
    try:
        import winreg
    except ImportError:
        # Std module, so should be around - but you never know!
        return proxies
    try:
        internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
            r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
        try:
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        if not p.strip():
                            # Tolerate stray separators such as 'a=b;;c=d'.
                            continue
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
        finally:
            # Release the registry handle even when a value is missing
            # or malformed.
            internetSettings.Close()
    except (OSError, ValueError, TypeError):
        # Either registry key not found etc, or the value in an
        # unexpected format.  (WindowsError is a subclass or alias of
        # OSError, so registry failures are still covered.)
        # proxies already set up to be empty so nothing to do
        pass
    return proxies
2435
def getproxies():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Settings found in the environment take precedence; the registry is
    consulted only when the environment defines no proxies.

    """
    env_proxies = getproxies_environment()
    if env_proxies:
        return env_proxies
    return getproxies_registry()
2444
def proxy_bypass_registry(host):
    """Test if the registry says the proxy should be bypassed for host.

    Reads ProxyEnable and ProxyOverride from the current user's
    'Internet Settings' registry key.  ProxyOverride is a ';'-separated
    list of glob patterns ('*' matches any run of characters, '?' a
    single character) plus the special entry '<local>', which matches
    host names that contain no dot.  Patterns are compared
    case-insensitively, and must match in full, against the host name
    (without port), its resolved IP address and its fully qualified
    name.  Empty entries are ignored.

    Returns 1 if the proxy should be bypassed, 0 otherwise (including
    when winreg is unavailable or the registry cannot be read).
    """
    try:
        import winreg
    except ImportError:
        # Std modules, so should be around - but you never know!
        return 0
    try:
        internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
            r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
        try:
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            # Returned as Unicode but problems if not converted to ASCII
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
        finally:
            # Always release the registry handle.
            internetSettings.Close()
    except OSError:
        # WindowsError is a subclass or alias of OSError.
        return 0
    if not proxyEnable or not proxyOverride:
        return 0
    # try to make a host list from name and IP address.
    rawHost, port = splitport(host)
    candidates = [rawHost]
    try:
        addr = socket.gethostbyname(rawHost)
        if addr != rawHost:
            candidates.append(addr)
    except socket.error:
        pass
    try:
        fqdn = socket.getfqdn(rawHost)
        if fqdn != rawHost:
            candidates.append(fqdn)
    except socket.error:
        pass
    # now check if we match one of the registry values.
    for test in proxyOverride.split(';'):
        test = test.strip()
        if not test:
            # e.g. a trailing ';' - an empty pattern must not match
            # every host.
            continue
        if test == '<local>':
            if '.' not in rawHost:
                return 1
            continue
        # Escape everything, then turn the glob wildcards back into
        # their regular-expression equivalents and anchor the end.
        pattern = re.escape(test)
        pattern = pattern.replace(r"\*", r".*")   # change glob sequence
        pattern = pattern.replace(r"\?", r".")    # change glob char
        pattern += r"\Z"
        for val in candidates:
            if re.match(pattern, val, re.I):
                return 1
    return 0
2495
def proxy_bypass(host):
    """Test if proxies should not be used for a particular host.

    When proxies are configured through the environment, the decision
    is taken from the environment (no_proxy); otherwise it is taken
    from the registry.

    """
    if getproxies_environment():
        return proxy_bypass_environment(host)
    else:
        return proxy_bypass_registry(host)
2507
2508else:
2509 # By default use environment variables
2510 getproxies = getproxies_environment
2511 proxy_bypass = proxy_bypass_environment