blob: 90dfcffe568101a142913e90b2c92ccecd98b136 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""An extensible library for opening URLs using a variety of protocols
2
The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
urlopen(url, data=None) -- Basic usage is the same as the original
urllib.  Pass the URL and, optionally, data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of a URL.  Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.
23
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers. Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of a default
handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
Senthil Kumaran1107c5d2009-11-15 06:20:55 +000033
Senthil Kumaran47fff872009-12-20 07:10:31 +000034OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000036
37Request -- An object that encapsulates the state of a request. The
38state can be as simple as the URL. It can also include extra HTTP
39headers, e.g. a User-Agent.
40
41BaseHandler --
42
43internals:
44BaseHandler and parent
45_call_chain conventions
46
47Example usage:
48
Georg Brandl029986a2008-06-23 11:44:14 +000049import urllib.request
Jeremy Hylton1afc1692008-06-18 20:49:58 +000050
51# set up authentication info
Georg Brandl029986a2008-06-23 11:44:14 +000052authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000053authinfo.add_password(realm='PDQ Application',
54 uri='https://mahler:8092/site-updates.py',
55 user='klem',
56 passwd='geheim$parole')
57
Georg Brandl029986a2008-06-23 11:44:14 +000058proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton1afc1692008-06-18 20:49:58 +000059
60# build a new opener that adds authentication and caching FTP handlers
Georg Brandl029986a2008-06-23 11:44:14 +000061opener = urllib.request.build_opener(proxy_support, authinfo,
62 urllib.request.CacheFTPHandler)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000063
64# install it
Georg Brandl029986a2008-06-23 11:44:14 +000065urllib.request.install_opener(opener)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000066
Georg Brandl029986a2008-06-23 11:44:14 +000067f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton1afc1692008-06-18 20:49:58 +000068"""
69
70# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails for some reason, how should the error be signalled?  The
# client needs to know the HTTP error code.  But if the handler knows
# what the problem was, e.g., that it didn't know the hash algorithm
# that was requested in the challenge, it would be good to pass that
# information along to the client, too.
77# ftp errors aren't handled cleanly
78# check digest against correct (i.e. non-apache) implementation
79
80# Possible extensions:
81# complex proxies XXX not sure what exactly was meant by this
82# abstract factory for opener
83
84import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000085import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000086import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
Jeremy Hylton1afc1692008-06-18 20:49:58 +000092import re
93import socket
94import sys
95import time
Senthil Kumaran7bc0d872010-12-19 10:49:52 +000096import collections
Jeremy Hylton1afc1692008-06-18 20:49:58 +000097
Georg Brandl13e89462008-07-01 19:56:00 +000098from urllib.error import URLError, HTTPError, ContentTooShortError
99from urllib.parse import (
100 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
101 splittype, splithost, splitport, splituser, splitpasswd,
Senthil Kumarand95cc752010-08-08 11:27:53 +0000102 splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000103from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000104
105# check for SSL
106try:
107 import ssl
Senthil Kumaranc2958622010-11-22 04:48:26 +0000108except ImportError:
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000109 _have_ssl = False
110else:
111 _have_ssl = True
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000112
Senthil Kumaran6c5bd402011-11-01 23:20:31 +0800113__all__ = [
114 # Classes
115 'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
116 'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
117 'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
118 'AbstractBasicAuthHandler', 'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler',
119 'AbstractDigestAuthHandler', 'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler',
120 'HTTPHandler', 'FileHandler', 'FTPHandler', 'CacheFTPHandler',
121 'UnknownHandler', 'HTTPErrorProcessor',
122 # Functions
123 'urlopen', 'install_opener', 'build_opener',
124 'pathname2url', 'url2pathname', 'getproxies',
125 # Legacy interface
126 'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
127]
128
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000129# used in User-Agent header sent
130__version__ = sys.version[:3]
131
# Module-wide default opener; created lazily by urlopen() or set
# explicitly through install_opener().
_opener = None


def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None):
    """Open *url* and return the response produced by an opener.

    url     -- a URL string or a Request object.
    data    -- optional request body, forwarded to the opener's open().
    timeout -- forwarded to the opener's open().
    cafile, capath -- keyword-only; when either is given, a one-off
        opener is built whose HTTPS handler verifies the server
        certificate against these locations and checks the hostname.
        That opener is not cached.

    Without cafile/capath the module-level default opener is used; it
    is created with build_opener() on first use.

    Raises ValueError if cafile/capath is given but the ssl module could
    not be imported.
    """
    global _opener
    if cafile or capath:
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.options |= ssl.OP_NO_SSLv2
        # This branch is only entered when a CA location was supplied,
        # so verification is always required here (the former inner
        # re-test of "cafile or capath" could never be false).
        context.verify_mode = ssl.CERT_REQUIRED
        context.load_verify_locations(cafile, capath)
        https_handler = HTTPSHandler(context=context, check_hostname=True)
        opener = build_opener(https_handler)
    elif _opener is None:
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000154
def install_opener(opener):
    """Install *opener* as the module-level default opener.

    The object is stored in the global ``_opener`` without any
    validation; urlopen() uses it when no cafile/capath is given.
    """
    global _opener
    _opener = opener
158
# TODO(jhylton): Make this work with the same global opener.
# Lazily created FancyURLopener shared by urlretrieve() and urlcleanup().
_urlopener = None


def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve *url* through the shared legacy FancyURLopener.

    The opener is created on first use and kept in the module-level
    ``_urlopener``.  All arguments are passed straight to its
    retrieve() method, whose result is returned unchanged.
    """
    global _urlopener
    _urlopener = _urlopener or FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
166
def urlcleanup():
    """Clean up the legacy opener and forget the default opener.

    Calls cleanup() on the shared ``_urlopener`` if one exists, then
    resets the module-level ``_opener`` to None if it is set.
    """
    global _opener
    legacy = _urlopener
    if legacy:
        legacy.cleanup()
    if _opener:
        _opener = None
173
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)


def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the network-location part of
    ``request.full_url``; if that is empty, the request's "Host" header
    is used instead.  A trailing ":port" is stripped.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.
    """
    netloc = urlparse(request.full_url)[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present
    without_port = _cut_port_re.sub("", netloc, 1)
    return without_port.lower()
191
class Request:
    """Encapsulate the state of a single URL request.

    Attributes set by the constructor:
      full_url          -- the unwrapped URL without its fragment
      fragment          -- the part after '#', as returned by splittag()
      data              -- request body, or None
      headers           -- regular headers (keys stored capitalized)
      unredirected_hdrs -- headers added via add_unredirected_header()
      origin_req_host, unverifiable, method -- as passed in (the origin
          host defaults to request_host(self))
      type, host, selector -- derived from full_url by _parse()
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        unwrapped = unwrap(url)
        self.full_url, self.fragment = splittag(unwrapped)
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        # Go through add_header() so that keys are normalized; the
        # caller's mapping is only read, never stored or modified.
        for name, value in headers.items():
            self.add_header(name, value)
        self.unredirected_hdrs = {}
        self.origin_req_host = (request_host(self)
                                if origin_req_host is None
                                else origin_req_host)
        self.unverifiable = unverifiable
        self.method = method
        self._parse()

    def _parse(self):
        """Split full_url into type, host and selector.

        Raises ValueError when the URL has no scheme.
        """
        scheme, remainder = splittype(self.full_url)
        self.type = scheme
        if scheme is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(remainder)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        if self.method is not None:
            return self.method
        if self.data is not None:
            return "POST"
        return "GET"

    # Begin deprecated methods

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        # Re-attach the fragment that the constructor split off.
        if not self.fragment:
            return self.full_url
        return '%s#%s' % (self.full_url, self.fragment)

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Route this request through the proxy at *host*.

        For an https request without a tunnel host yet, the original
        host is remembered in _tunnel_host and the type/selector are
        left alone.  Otherwise the type becomes *type* and the selector
        becomes the full URL.  In both cases host is replaced.
        """
        needs_tunnel = self.type == 'https' and not self._tunnel_host
        if needs_tunnel:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """Return True if *header_name* is in either header mapping."""
        if header_name in self.headers:
            return True
        return header_name in self.unredirected_hdrs

    def get_header(self, header_name, default=None):
        """Look up a header; regular headers win over unredirected ones."""
        fallback = self.unredirected_hdrs.get(header_name, default)
        return self.headers.get(header_name, fallback)

    def header_items(self):
        """Return a list of (name, value) pairs from both mappings.

        Regular headers override unredirected ones of the same name.
        """
        merged = dict(self.unredirected_hdrs)
        merged.update(self.headers)
        return list(merged.items())
296
class OpenerDirector:
    """Dispatch requests to registered handlers.

    Handlers are indexed by the names of their methods:
      <protocol>_open          -> handle_open[protocol]
      <protocol>_error_<kind>  -> handle_error[protocol][kind]
      <protocol>_request       -> process_request[protocol]
      <protocol>_response      -> process_response[protocol]
    """

    def __init__(self):
        self.addheaders = [('User-agent', "Python-urllib/%s" % __version__)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler* under every matching method name it has.

        Raises TypeError if the object has no add_parent attribute.
        The handler is only attached (add_parent called) when at least
        one of its methods was registered.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        simple_tables = {
            "open": self.handle_open,
            "response": self.process_response,
            "request": self.process_request,
        }
        registered = False
        for name in dir(handler):
            if name in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            sep = name.find("_")
            protocol = name[:sep]
            condition = name[sep+1:]

            if condition.startswith("error"):
                # The kind is whatever follows the second underscore;
                # numeric kinds (HTTP status codes) are stored as ints.
                cut = condition.find("_") + sep + 1
                kind = name[cut+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                table = self.handle_error.setdefault(protocol, {})
            elif condition in simple_tables:
                kind = protocol
                table = simple_tables[condition]
            else:
                continue

            # Keep each chain sorted by the handlers' own ordering.
            bisect.insort(table.setdefault(kind, []), handler)
            registered = True

        if registered:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        for handler in chain.get(kind, ()):
            outcome = getattr(handler, meth_name)(*args)
            if outcome is not None:
                return outcome

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a Request object).

        The request is run through the <protocol>_request processors,
        opened via _open(), and the response is passed through the
        <protocol>_response processors before being returned.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        request_hook = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            req = getattr(processor, request_hook)(req)

        response = self._open(req, data)

        # post-process response
        response_hook = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            response = getattr(processor, response_hook)(req, response)

        return response

    def _open(self, req, data=None):
        """Try the default, protocol-specific and unknown open chains."""
        protocol = req.type
        attempts = (('default', 'default_open'),
                    (protocol, protocol + '_open'))
        for kind, meth_name in attempts:
            result = self._call_chain(self.handle_open, kind, meth_name, req)
            if result:
                return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the matching error-handler chain.

        For http/https the chain is selected by the third positional
        argument and, if nothing handles it, http_error_default is tried.
        """
        is_http = proto in ('http', 'https')
        if is_http:
            # XXX http[s] protocols are special-cased
            table = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
        else:
            table = self.handle_error
            meth_name = proto + '_error'

        result = self._call_chain(table, proto, meth_name, *args)
        if result:
            return result

        if is_http:
            return self._call_chain(table, 'default', 'http_error_default',
                                    *args)
434
435# XXX probably also want an abstract factory that knows when it makes
436# sense to skip a superclass in favor of a subclass and when it might
437# make sense to include both
438
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    Each argument may be a handler instance or a handler class (which
    is instantiated without arguments).  If any of the handlers passed
    as arguments are subclasses of the default handlers, the default
    handlers will not be used.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    # Collect the defaults that a caller-supplied handler replaces.
    overridden = set()
    for default in default_classes:
        for candidate in handlers:
            if isclass(candidate):
                replaces = issubclass(candidate, default)
            else:
                replaces = isinstance(candidate, default)
            if replaces:
                overridden.add(default)

    # Install the surviving defaults first, then the caller's handlers.
    for default in default_classes:
        if default not in overridden:
            opener.add_handler(default())

    for candidate in handlers:
        instance = candidate() if isclass(candidate) else candidate
        opener.add_handler(instance)
    return opener
476
class BaseHandler:
    """Base class for handlers registered with an OpenerDirector.

    handler_order determines the position in a handler chain: handlers
    compare by it, lower values sorting first.
    """
    handler_order = 500

    def add_parent(self, parent):
        """Remember the object this handler has been attached to."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        try:
            other_order = other.handler_order
        except AttributeError:
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other_order
494
495
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return *response*, or the result of error handling for it.

        Any status code outside 200-299 is handed to the parent's
        error() method and whatever that returns is passed on.
        """
        code = response.code
        msg = response.msg
        hdrs = response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= code < 300:
            return response
        return self.parent.error('http', request, response, code, msg, hdrs)

    https_response = http_response
512
class HTTPDefaultErrorHandler(BaseHandler):
    """Handler whose http_error_default turns an HTTP error into an exception."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        # Always raises: the HTTPError carries the request URL, the
        # status code and message, the headers and the response object.
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000516
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirects, guarding against loops."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.

        The new request carries the original headers except
        Content-Length and Content-Type, has no data, and is marked
        unverifiable.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items()
                      if k.lower() not in CONTENT_HEADERS}
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow the redirect named by the Location (or URI) header.

        Returns None when neither header is present or when
        redirect_request() declines.  Raises HTTPError for a target
        scheme other than http, https, ftp or empty (relative), and
        when the loop-detection limits are exceeded.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        # Only supply the missing "/" for targets that name a host
        # ("http://example.com").  A relative reference with an empty
        # path (e.g. "?page=2") must keep it, otherwise urljoin() below
        # would resolve it against the site root instead of the path of
        # the current request.
        if not urlparts.path and urlparts.netloc:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
619
620
621def _parse_proxy(proxy):
622 """Return (scheme, user, password, host/port) given a URL or an authority.
623
624 If a URL is supplied, it must have an authority (host:port) component.
625 According to RFC 3986, having an authority component means the URL must
626 have two slashes after the scheme:
627
628 >>> _parse_proxy('file:/ftp.example.com/')
629 Traceback (most recent call last):
630 ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
631
632 The first three items of the returned tuple may be None.
633
634 Examples of authority parsing:
635
636 >>> _parse_proxy('proxy.example.com')
637 (None, None, None, 'proxy.example.com')
638 >>> _parse_proxy('proxy.example.com:3128')
639 (None, None, None, 'proxy.example.com:3128')
640
641 The authority component may optionally include userinfo (assumed to be
642 username:password):
643
644 >>> _parse_proxy('joe:password@proxy.example.com')
645 (None, 'joe', 'password', 'proxy.example.com')
646 >>> _parse_proxy('joe:password@proxy.example.com:3128')
647 (None, 'joe', 'password', 'proxy.example.com:3128')
648
649 Same examples, but with URLs instead:
650
651 >>> _parse_proxy('http://proxy.example.com/')
652 ('http', None, None, 'proxy.example.com')
653 >>> _parse_proxy('http://proxy.example.com:3128/')
654 ('http', None, None, 'proxy.example.com:3128')
655 >>> _parse_proxy('http://joe:password@proxy.example.com/')
656 ('http', 'joe', 'password', 'proxy.example.com')
657 >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
658 ('http', 'joe', 'password', 'proxy.example.com:3128')
659
660 Everything after the authority is ignored:
661
662 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
663 ('ftp', 'joe', 'password', 'proxy.example.com')
664
665 Test for no trailing '/' case:
666
667 >>> _parse_proxy('http://joe:password@proxy.example.com')
668 ('http', 'joe', 'password', 'proxy.example.com')
669
670 """
Georg Brandl13e89462008-07-01 19:56:00 +0000671 scheme, r_scheme = splittype(proxy)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000672 if not r_scheme.startswith("/"):
673 # authority
674 scheme = None
675 authority = proxy
676 else:
677 # URL
678 if not r_scheme.startswith("//"):
679 raise ValueError("proxy URL with no authority: %r" % proxy)
680 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
681 # and 3.3.), path is empty or starts with '/'
682 end = r_scheme.find("/", 2)
683 if end == -1:
684 end = None
685 authority = r_scheme[2:end]
Georg Brandl13e89462008-07-01 19:56:00 +0000686 userinfo, hostport = splituser(authority)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000687 if userinfo is not None:
Georg Brandl13e89462008-07-01 19:56:00 +0000688 user, password = splitpasswd(userinfo)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000689 else:
690 user = password = None
691 return scheme, user, password, hostport
692
class ProxyHandler(BaseHandler):
    """Redirect requests through the proxies configured per URL scheme."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        """Create one <scheme>_open method per entry of *proxies*.

        *proxies* maps a scheme to a proxy URL; when it is None the
        mapping returned by getproxies() is used.
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, proxy_url in proxies.items():
            # Bind the loop values as lambda defaults so that every
            # generated method keeps its own scheme and proxy URL.
            opener = (lambda r, proxy=proxy_url, type=scheme,
                      meth=self.proxy_open: meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, opener)

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*.

        Returns None when the host is bypassed or when the remaining
        handlers can deal with the modified request; otherwise the
        request is re-opened through the parent.
        """
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            credentials = '%s:%s' % (unquote(user), unquote(password))
            encoded = base64.b64encode(credentials.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + encoded)
        req.set_proxy(unquote(hostport), proxy_type)

        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None

        # need to start over, because the other handlers don't
        # grok the proxy's URL type
        # e.g. if we have a constructor arg proxies like so:
        # {'http': 'ftp://proxy.example.com'}, we may end up turning
        # a request for http://acme.example.com/a into one for
        # ftp://proxy.example.com/a
        return self.parent.open(req, timeout=req.timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000734
class HTTPPasswordMgr:
    """Store (user, password) pairs keyed by realm and URI prefix.

    self.passwd maps realm -> {tuple of reduced URIs -> (user, passwd)},
    where a reduced URI is the (authority, path) pair produced by
    reduce_uri().
    """

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *uri* (a string or a sequence of them).

        Each URI is stored twice, once with the scheme's default port
        made explicit and once as given, so that lookups match either
        spelling.
        """
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
                self.reduce_uri(u, default_port) for u in uri)
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) registered for *authuri* in *realm*.

        Returns (None, None) when nothing matches.
        """
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare on a path-segment boundary.  A plain character-wise
        # common prefix would treat "/foobar" as lying below "/foo" and
        # hand credentials to an unrelated path.
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
797
798
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the realm None."""

    def find_user_password(self, realm, authuri):
        """Look up *authuri* in *realm*, then in the default realm (None).

        The fallback is used whenever the realm-specific lookup yields
        no user.
        """
        credentials = HTTPPasswordMgr.find_user_password(self, realm, authuri)
        user, password = credentials
        if user is None:
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
807
808
class AbstractBasicAuthHandler:
    """Shared retry logic for HTTP Basic authentication.

    This class does not define ``auth_header`` or ``parent`` itself;
    retry_http_basic_auth() reads both from the instance, so they have to
    be supplied by the concrete handler class it is mixed into.
    """

    # Matches a single `scheme realm="value"` challenge and yields the
    # groups (scheme, quote character, realm).
    #
    # The pattern is anchored at the start of the header or at a comma.
    # The previous leading `(?:.*,)*` nested two unbounded repetitions,
    # which backtracks exponentially on headers containing many commas
    # and made the search settle on the *last* challenge in the header.
    #
    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:^|,)'       # start of the header or a ','
                    '[ \t]*'        # optional whitespace
                    '([^ \t,]+)'    # auth-scheme, e.g. "Basic"
                    '[ \t]+'        # mandatory whitespace
                    'realm=(["\'])(.*?)\\2',  # quoted realm value
                    re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        """Use *password_mgr* for credentials, or a new HTTPPasswordMgr."""
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        # Expose the manager's add_password directly on the handler.
        self.add_password = self.passwd.add_password
        self.retried = 0

    def reset_retry_count(self):
        """Reset the counter of authentication attempts to zero."""
        self.retried = 0

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry *req* with Basic credentials if the challenge asks for them.

        *authreq* is the name of the challenge header to read from
        *headers*.  Returns the response of the retried request, or None
        when no retry was made.  Raises HTTPError once the attempt
        counter exceeds 5, and ValueError when the header starts with a
        scheme other than Basic.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if self.retried > 5:
            # retry sending the username:password 5 times before failing.
            raise HTTPError(req.get_full_url(), 401, "basic auth failed",
                            headers, None)
        else:
            self.retried += 1

        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() != 'basic':
                raise ValueError("AbstractBasicAuthHandler does not"
                                 " support the following scheme: '%s'" %
                                 scheme)
            else:
                mo = AbstractBasicAuthHandler.rx.search(authreq)
                if mo:
                    scheme, quote, realm = mo.groups()
                    if scheme.lower() == 'basic':
                        response = self.retry_http_basic_auth(host, req, realm)
                        if response and response.code != 401:
                            # Credentials were accepted: start counting afresh.
                            self.retried = 0
                        return response

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send *req* with a Basic credentials header for *realm*.

        Returns None when the password manager has no password for
        (*realm*, *host*) or when *req* already carries exactly these
        credentials; otherwise returns whatever self.parent.open() returns.
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.headers.get(self.auth_header, None) == auth:
                # The same credentials were already sent; do not loop.
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None
873
874
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Basic authentication handler for 401 responses."""

    # Request header in which the credentials are sent.
    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Answer the 'www-authenticate' challenge for *req*.

        The request's full URL is what gets passed on for the password
        lookup.  The retry counter is reset after the attempt returns.
        """
        outcome = self.http_error_auth_reqed('www-authenticate',
                                             req.full_url, req, headers)
        self.reset_retry_count()
        return outcome
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000885
886
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Basic authentication handler for 407 responses."""

    # Request header in which the credentials are sent.
    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Answer the 'proxy-authenticate' challenge for *req*.

        The retry counter is reset after the attempt returns.
        """
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        outcome = self.http_error_auth_reqed('proxy-authenticate',
                                             req.host, req, headers)
        self.reset_retry_count()
        return outcome
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000901
902
# Alias for os.urandom: _randombytes(n) returns n random bytes.
# AbstractDigestAuthHandler.get_cnonce() calls it through this name.
_randombytes = os.urandom
905
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000906
class AbstractDigestAuthHandler:
    """Shared client-side logic for HTTP Digest authentication.

    Digest authentication is specified in RFC 2617.  This class does not
    define ``auth_header`` or ``parent`` itself; retry_http_digest_auth()
    reads both from the instance, so they have to be supplied by the
    concrete handler class it is mixed into.
    """

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" support is shaky

    def __init__(self, passwd=None):
        """Use *passwd* for credentials, or a new HTTPPasswordMgr."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        # Expose the manager's add_password directly on the handler.
        self.add_password = self.passwd.add_password
        self.retried = 0
        # nonce_count / last_nonce track how often the current server
        # nonce has been used; see get_authorization().
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        """Reset the counter of authentication attempts to zero."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry *req* with Digest credentials if the challenge asks for them.

        *auth_header* names the challenge header to read from *headers*.
        Returns the retried response for a Digest challenge and None for
        a Basic or missing challenge.  Raises HTTPError once the attempt
        counter exceeds 5, and ValueError for any other scheme.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        """Re-send *req* with a Digest credentials header built from *auth*.

        *auth* is the complete challenge header value.  Returns None when
        no credentials could be computed or when *req* already carries
        exactly this header; otherwise returns whatever
        self.parent.open() returns.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # The same credentials were already sent; do not loop.
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        """Return a 16-character hexadecimal client nonce."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest credentials string for *req*.

        *chal* is the parsed challenge as a dict.  Returns None when the
        challenge lacks a realm or nonce, names an unsupported algorithm,
        or no user is known for the realm.  Raises URLError when the
        challenge offers a qop but 'auth' is not among the options.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in [option.strip() for option in qop.split(',')]:
            # The challenge may offer several options ("auth,auth-int");
            # 'auth' is the one implemented here, so select it whenever
            # it is on offer instead of rejecting the whole list.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce,
                                           'auth', H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) functions for *algorithm*.

        'MD5' and 'SHA' are supported.  For any other name (None, None)
        is returned, which get_authorization() treats as "cannot
        authenticate".
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        else:
            # Without this branch H stayed unbound and the return below
            # failed with UnboundLocalError.
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        """Placeholder for the entity digest; always returns None."""
        # XXX not implemented yet
        return None
1046
1047
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    handler_order = 490  # before Basic auth
    # Request header in which the credentials are sent.
    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Answer the 'www-authenticate' challenge for *req*.

        The network location of the request's full URL is passed on as
        the host.  The retry counter is reset after the attempt returns.
        """
        netloc = urlparse(req.full_url)[1]
        outcome = self.http_error_auth_reqed('www-authenticate',
                                             netloc, req, headers)
        self.reset_retry_count()
        return outcome
1064
1065
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication handler for 407 responses."""

    handler_order = 490  # before Basic auth
    # Request header in which the credentials are sent.
    auth_header = 'Proxy-Authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Answer the 'proxy-authenticate' challenge for *req*.

        The retry counter is reset after the attempt returns.
        """
        outcome = self.http_error_auth_reqed('proxy-authenticate',
                                             req.host, req, headers)
        self.reset_retry_count()
        return outcome
1077
class AbstractHTTPHandler(BaseHandler):
    """Request preparation and connection handling shared by HTTP handlers.

    do_request_() completes a request's headers before it is sent, and
    do_open() performs the exchange with a caller-supplied connection
    class.
    """

    def __init__(self, debuglevel=0):
        """Remember *debuglevel* for the connections opened by do_open()."""
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Change the debug level used for subsequently opened connections."""
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in missing headers on *request* and return it.

        Adds Content-type and Content-length when the request has data,
        a Host header, and every default header from
        self.parent.addheaders that the request does not already carry.
        Raises URLError when the request has no host, TypeError when the
        data is a str, and ValueError when the data is an iterable whose
        length cannot be determined and no Content-length was given.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                raise TypeError("POST data should be bytes"
                        " or an iterable of bytes. It cannot be str.")
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                try:
                    mv = memoryview(data)
                except TypeError:
                    # Not a buffer: the size is unknown.
                    # NOTE(review): collections.Iterable is the older
                    # spelling of collections.abc.Iterable; confirm which
                    # Python versions this file has to support.
                    if isinstance(data, collections.Iterable):
                        raise ValueError("Content-Length should be specified "
                                "for iterable data of type %r %r" % (type(data),
                                data))
                else:
                    request.add_unredirected_header(
                            'Content-length', '%d' % (len(mv) * mv.itemsize))

        sel_host = host
        if request.has_proxy():
            # Through a proxy the selector is a full URL; take the Host
            # value from it rather than from request.host.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        Extra keyword arguments are passed to the http_class constructor.
        Raises URLError when the request has no host or when sending the
        request fails with a socket error.  The connection is closed
        before any exception leaves this method.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        # Apply the level configured through __init__() or
        # set_http_debuglevel(); it was stored but never handed to the
        # connection.
        h.set_debuglevel(self._debuglevel)

        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers)
            except socket.error as err:  # timeout error
                raise URLError(err)
            r = h.getresponse()
        except BaseException:
            # Whatever went wrong, with sending or with reading the
            # response, do not leave the connection open.
            h.close()
            raise

        r.url = req.get_full_url()
        # Overwrite the response's .msg attribute with the reason phrase,
        # because urllib clients expect to find the reason in .msg.  It
        # would be good to mark this attribute as deprecated and get
        # clients to use info() or .headers instead.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001179
1180
class HTTPHandler(AbstractHTTPHandler):
    """Handler that opens requests with http.client.HTTPConnection."""

    # Request pre-processing is the shared base-class implementation.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Open *req* via do_open() with the plain HTTP connection class."""
        connection_class = http.client.HTTPConnection
        return self.do_open(connection_class, req)
1187
# The HTTPS handler is only defined (and exported) when http.client
# provides HTTPSConnection.
if hasattr(http.client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Handler that opens requests with http.client.HTTPSConnection."""

        # Request pre-processing is the shared base-class implementation.
        https_request = AbstractHTTPHandler.do_request_

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            """Store the options later handed to the connection class."""
            self._context = context
            self._check_hostname = check_hostname
            AbstractHTTPHandler.__init__(self, debuglevel)

        def https_open(self, req):
            """Open *req* via do_open(), forwarding the stored options."""
            connection_options = {
                'context': self._context,
                'check_hostname': self._check_hostname,
            }
            return self.do_open(http.client.HTTPSConnection, req,
                                **connection_options)

    __all__.append('HTTPSHandler')
Senthil Kumaran0d54eb92011-11-01 23:49:46 +08001204
class HTTPCookieProcessor(BaseHandler):
    """Handler that passes requests and responses through a cookie jar."""

    def __init__(self, cookiejar=None):
        """Use *cookiejar*, or a new http.cookiejar.CookieJar if None."""
        import http.cookiejar
        if cookiejar is not None:
            self.cookiejar = cookiejar
        else:
            self.cookiejar = http.cookiejar.CookieJar()

    def http_request(self, request):
        """Let the jar add its cookie header to *request*; return it."""
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Let the jar extract cookies from *response*; return it."""
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    # HTTPS traffic is processed in exactly the same way.
    https_request = http_request
    https_response = http_response
1222
class UnknownHandler(BaseHandler):
    """Handler whose unknown_open() rejects the request with URLError."""

    def unknown_open(self, req):
        """Raise URLError naming the type of *req*."""
        url_type = req.type
        raise URLError('unknown url type: %s' % url_type)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001227
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dict mapping each key to its value; a value enclosed in
    double quotes has the quotes removed.  Each element is split at its
    first '=', so an element without '=' raises ValueError.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices rather than indexes: an empty value ("key=") must be
        # stored as '' instead of raising IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1237
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.

    Inside a quoted-string a backslash is dropped and the character
    after it is kept as-is.  Elements are returned stripped of
    surrounding whitespace.
    """
    items = []
    buf = []             # characters of the element being collected
    in_quote = False
    pending_escape = False

    for ch in s:
        if pending_escape:
            # Character following a backslash: always taken literally.
            pending_escape = False
            buf.append(ch)
        elif in_quote:
            if ch == '\\':
                pending_escape = True
            else:
                if ch == '"':
                    in_quote = False
                buf.append(ch)
        elif ch == ',':
            # Unquoted comma: the current element is complete.
            items.append(''.join(buf))
            buf = []
        else:
            if ch == '"':
                in_quote = True
            buf.append(ch)

    # append last part
    if buf:
        items.append(''.join(buf))

    return [item.strip() for item in items]
1280
class FileHandler(BaseHandler):
    """Handler that opens file: requests referring to the local machine."""

    def file_open(self, req):
        """Open the local file named by *req*.

        Raises URLError when the selector starts with '//' followed by a
        host part and the request's host is neither 'localhost' nor one
        of the names returned by get_names().
        """
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            # Membership test: the previous identity comparison of the
            # host string with the tuple of names could never succeed,
            # so every such request was rejected.
            if req.host not in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the tuple of addresses regarded as the local host.

        The result is computed on first use and cached on the class.
        """
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the local file named by *req*.

        The headers carry Content-type (guessed from the name,
        'text/plain' if unknown), Content-length and Last-modified.
        Raises URLError when the file cannot be accessed or when the
        request's host does not resolve to a local address.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001332
def _safe_gethostbyname(host):
    """Return socket.gethostbyname(host), or None on socket.gaierror."""
    try:
        address = socket.gethostbyname(host)
    except socket.gaierror:
        address = None
    return address
1338
class FTPHandler(BaseHandler):
    """Handler that retrieves ftp: requests through an ftpwrapper."""

    def ftp_open(self, req):
        """Fetch *req* over FTP and return an addinfourl for the data.

        User name and password are taken from the host part when
        present, otherwise empty strings are used.  A ';type=' attribute
        on the selector with one of the values a, i or d (either case)
        selects the transfer type.  Raises URLError when the host is
        missing or cannot be resolved, and for any ftplib error.
        """
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        port = ftplib.FTP_PORT if port is None else int(port)

        # username/password handling
        user, host = splituser(host)
        passwd = None
        if user:
            user, passwd = splitpasswd(user)
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)

        path, attrs = splitattr(req.selector)
        segments = [unquote(segment) for segment in path.split('/')]
        dirs, filename = segments[:-1], segments[-1]
        if dirs and not dirs[0]:
            # Drop the empty segment produced by the leading '/'.
            dirs = dirs[1:]
        try:
            wrapper = self.connect_ftp(user, passwd, host, port, dirs,
                                       req.timeout)
            # Default: 'I' when a file name is given, 'D' otherwise.
            transfer_type = 'I' if filename else 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if (attr.lower() == 'type' and
                        value in ('a', 'A', 'i', 'I', 'd', 'D')):
                    transfer_type = value.upper()
            fp, retrlen = wrapper.retrfile(filename, transfer_type)
            header_lines = []
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                header_lines.append("Content-type: %s\n" % mtype)
            if retrlen is not None and retrlen >= 0:
                header_lines.append("Content-length: %d\n" % retrlen)
            headers = email.message_from_string("".join(header_lines))
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a new, non-persistent ftpwrapper for the given target."""
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001396
class CacheFTPHandler(FTPHandler):
    """FTP handler that keeps ftpwrapper connections in a cache for reuse."""

    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}      # connection key -> ftpwrapper
        self.timeout = {}    # connection key -> expiry time (time.time() based)
        self.soonest = 0     # smallest expiry time in self.timeout, 0 if empty
        self.delay = 60      # seconds added to "now" whenever an entry is used
        self.max_conns = 16  # cache size at which one entry is dropped

    def setTimeout(self, t):
        """Set the delay, in seconds, after which an unused entry expires."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which an entry is dropped."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return the cached ftpwrapper for this target, creating it if needed.

        The entry's expiry time is refreshed and the cache is checked on
        every call.
        """
        key = user, host, port, '/'.join(dirs), timeout
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
        self.timeout[key] = time.time() + self.delay
        # Take the connection before checking the cache: the check may
        # drop this very entry (for instance with max_conns == 1), and
        # looking it up afterwards would then fail.
        conn = self.cache[key]
        self.check_cache()
        return conn

    def check_cache(self):
        """Close expired connections and drop one entry when the cache is full."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        # An empty table has no minimum; fall back to the initial value
        # instead of letting min() raise ValueError.
        self.soonest = min(self.timeout.values()) if self.timeout else 0

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # Note: the dropped connection is not closed here.
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values()) if self.timeout else 0

    def clear_cache(self):
        """Close every cached connection and empty the cache."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
1449
1450
# Code moved from the old urllib module
1452
# Size limit for the FTP connection cache.  The code that enforces it is
# not visible in this part of the file -- presumably it applies to the
# module-level ``ftpcache`` dict; confirm at the use site.
MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
1454
1455# Helper for non-unix systems
if os.name != 'nt':
    # Everywhere except Windows the conversion is plain percent-coding.
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        local_path = unquote(pathname)
        return local_path

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        url_path = quote(pathname)
        return url_path
else:
    # Windows uses the dedicated implementations from nturl2path.
    from nturl2path import url2pathname, pathname2url
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001468
1469# This really consists of two pieces:
1470# (1) a class which handles opening of all sorts of URLs
1471# (plus assorted utilities etc.)
1472# (2) a set of functions for parsing URLs
1473# XXX Should these be separated out into different modules?
1474
1475
# Module-level FTP cache.  URLopener.__init__ stores a reference to this
# dict as self.ftpcache, so openers share it unless that attribute is
# reassigned on an instance.
ftpcache = {}
1477class URLopener:
1478 """Class to open URLs.
1479 This is a class rather than just a subroutine because we may need
1480 more than one set of global protocol-specific options.
1481 Note -- this is a base class for those who don't want the
1482 automatic handling of errors type 302 (relocated) and 401
1483 (authorization needed)."""
1484
1485 __tempfiles = None
1486
1487 version = "Python-urllib/%s" % __version__
1488
1489 # Constructor
def __init__(self, proxies=None, **x509):
    """Set up the opener.

    *proxies* must be a mapping (anything with a ``keys`` attribute);
    when it is None the result of getproxies() is used.  The keyword
    arguments 'key_file' and 'cert_file' are stored on the instance.
    """
    if proxies is None:
        proxies = getproxies()
    assert hasattr(proxies, 'keys'), "proxies must be a mapping"
    self.proxies = proxies

    key_file = x509.get('key_file')
    cert_file = x509.get('cert_file')
    self.key_file = key_file
    self.cert_file = cert_file

    self.addheaders = [('User-Agent', self.version)]

    # Temporary files registered here are removed by cleanup().
    self.__tempfiles = []
    self.__unlink = os.unlink  # See cleanup()

    # Undocumented feature: if you assign {} to tempcache,
    # it is used to cache files retrieved with
    # self.retrieve().  This is not enabled by default
    # since it does not work for changing documents (and I
    # haven't got the logic to check expiration headers
    # yet).
    self.tempcache = None

    # Undocumented feature: you can use a different
    # ftp cache by assigning to the .ftpcache member;
    # in case you want logically independent URL openers
    # XXX This is not threadsafe.  Bah.
    self.ftpcache = ftpcache
1512
def __del__(self):
    """Call close() when the opener object is finalized."""
    self.close()
1515
def close(self):
    """Release the opener's resources; this simply calls cleanup()."""
    self.cleanup()
1518
def cleanup(self):
    """Delete the registered temporary files and empty the temp cache.

    Files that cannot be removed (OSError) are skipped silently.
    """
    # This code sometimes runs when the rest of this module
    # has already been deleted, so it can't use any globals
    # or import anything.
    pending = self.__tempfiles
    if pending:
        remove = self.__unlink
        for path in pending:
            try:
                remove(path)
            except OSError:
                pass
        # Empty the list in place so the attribute keeps the same object.
        del pending[:]
    cache = self.tempcache
    if cache:
        cache.clear()
1532
def addheader(self, *args):
    """Add a header to be used by the HTTP interface only
    e.g. u.addheader('Accept', 'sound/basic')

    The positional arguments are appended to self.addheaders as one
    tuple.
    """
    self.addheaders.append(args)
1537
1538 # External interface
def open(self, fullurl, data=None):
    """Use URLopener().open(file) instead of open(file, 'r')."""
    fullurl = quote(unwrap(to_bytes(fullurl)),
                    safe="%/:=&?~#+!$,;'@()*[]|")

    # Serve the request from the temp cache when the URL is in it.
    cache = self.tempcache
    if cache and fullurl in cache:
        filename, headers = cache[fullurl]
        return addinfourl(open(filename, 'rb'), headers, fullurl)

    urltype, url = splittype(fullurl)
    if not urltype:
        urltype = 'file'

    proxy = None
    if urltype in self.proxies:
        # Route through the proxy: dispatch on the proxy's own scheme.
        proxy = self.proxies[urltype]
        urltype, proxyhost = splittype(proxy)
        host = splithost(proxyhost)[0]
        url = (host, fullurl)  # Signal special case to open_*()

    # The method handling a scheme is open_<scheme>, dashes as underscores.
    method_name = ('open_' + urltype).replace('-', '_')
    self.type = urltype
    if not hasattr(self, method_name):
        if proxy:
            return self.open_unknown_proxy(proxy, fullurl, data)
        return self.open_unknown(fullurl, data)

    try:
        opener = getattr(self, method_name)
        arguments = (url,) if data is None else (url, data)
        return opener(*arguments)
    except HTTPError:
        raise
    except socket.error as msg:
        raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1574
def open_unknown(self, fullurl, data=None):
    """Overridable hook called when no handler exists for the URL's scheme.

    This implementation always raises
    IOError('url error', 'unknown url type', <scheme>); data is ignored.
    """
    scheme = splittype(fullurl)[0]
    raise IOError('url error', 'unknown url type', scheme)
1579
def open_unknown_proxy(self, proxy, fullurl, data=None):
    """Overridable hook called when the configured proxy's scheme has no handler.

    This implementation always raises
    IOError('url error', 'invalid proxy for <scheme>', proxy), where
    <scheme> is the scheme of fullurl; data is ignored.
    """
    scheme = splittype(fullurl)[0]
    raise IOError('url error', 'invalid proxy for %s' % scheme, proxy)
1584
1585 # External interface
def retrieve(self, url, filename=None, reporthook=None, data=None):
    """Copy the object behind url to a local file.

    Returns (filename, headers).  When no filename is given and the URL
    has no scheme or the ``file`` scheme, the local path is returned
    without copying; otherwise the data is written to filename, or to a
    fresh temporary file that cleanup() will later delete.

    reporthook, if given, is called as reporthook(blocknum, blocksize,
    totalsize) once before reading and once after every block;
    totalsize is -1 when no Content-Length header is present.

    Raises ContentTooShortError when fewer bytes arrive than the
    Content-Length header announced.
    """
    url = unwrap(to_bytes(url))
    if self.tempcache and url in self.tempcache:
        return self.tempcache[url]
    scheme, remainder = splittype(url)
    if filename is None and (not scheme or scheme == 'file'):
        try:
            local = self.open_local_file(remainder)
            local_headers = local.info()
            local.close()
            return url2pathname(splithost(remainder)[1]), local_headers
        except IOError:
            # Not usable as a local file; fall through to a normal open.
            pass
    fp = self.open(url, data)
    try:
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            import tempfile
            # Reduce the URL to its bare path so the temporary file can
            # keep the same extension.
            path = splittype(url)[1]
            path = splithost(path or "")[1]
            path = splitquery(path or "")[0]
            path = splitattr(path or "")[0]
            suffix = os.path.splitext(path)[1]
            fd, filename = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        try:
            result = filename, headers
            if self.tempcache is not None:
                self.tempcache[url] = result
            bs = 1024*8
            size = -1
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
        finally:
            tfp.close()
    finally:
        fp.close()

    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result
1649
1650 # Each method named open_<type> knows how to open that type of URL
1651
def _open_generic_http(self, connection_factory, url, data):
    """Perform one HTTP request over a connection from connection_factory.

    This is an internal method that should be called from
    open_http() or open_https().

    Arguments:
    - connection_factory takes a host name and returns an
      HTTPConnection-like object.
    - url is either the URL remainder as a string ("//host/path") or,
      when going through a proxy, a (proxyhost, full_url) pair.
    - data is the payload for a POST request, or None for a GET.

    Returns an addinfourl for a 2xx response; any other status is handed
    to self.http_error().  Raises IOError when no host can be determined
    and URLError on a bad status line.
    """
    user_passwd = None
    proxy_passwd = None
    if isinstance(url, str):
        host, selector = splithost(url)
        if host:
            user_passwd, host = splituser(host)
            host = unquote(host)
        realhost = host
    else:
        host, selector = url
        # check whether the proxy contains authorization information
        proxy_passwd, host = splituser(host)
        # now we proceed with the url we want to obtain
        urltype, rest = splittype(selector)
        url = rest
        user_passwd = None
        if urltype.lower() != 'http':
            realhost = None
        else:
            realhost, rest = splithost(rest)
            if realhost:
                user_passwd, realhost = splituser(realhost)
            if user_passwd:
                # Credentials are sent as a header, so drop them from
                # the request line.
                selector = "%s://%s%s" % (urltype, realhost, rest)
            if proxy_bypass(realhost):
                host = realhost

    if not host:
        raise IOError('http error', 'no host given')

    # Basic-auth tokens: base64 of the unquoted "user:password" text.
    if proxy_passwd:
        proxy_passwd = unquote(proxy_passwd)
        proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
    else:
        proxy_auth = None
    if user_passwd:
        user_passwd = unquote(user_passwd)
        auth = base64.b64encode(user_passwd.encode()).decode('ascii')
    else:
        auth = None

    http_conn = connection_factory(host)

    headers = {}
    if proxy_auth:
        headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
    if auth:
        headers["Authorization"] = "Basic %s" % auth
    if realhost:
        headers["Host"] = realhost
    # Add Connection:close as we don't support persistent connections yet.
    # This helps in closing the socket and avoiding ResourceWarning
    headers["Connection"] = "close"
    # Caller-supplied headers go last so they override the defaults above.
    for header, value in self.addheaders:
        headers[header] = value

    if data is None:
        http_conn.request("GET", selector, headers=headers)
    else:
        headers["Content-Type"] = "application/x-www-form-urlencoded"
        http_conn.request("POST", selector, data, headers)

    try:
        response = http_conn.getresponse()
    except http.client.BadStatusLine:
        # something went wrong with the HTTP status line
        raise URLError("http protocol error: bad status line")

    # According to RFC 2616, "2xx" code indicates that the client's
    # request was successfully received, understood, and accepted.
    if not (200 <= response.status < 300):
        return self.http_error(
            url, response.fp,
            response.status, response.reason, response.msg, data)
    return addinfourl(response, response.msg, "http:" + url,
                      response.status)
1744
def open_http(self, url, data=None):
    """Use HTTP protocol.

    Delegates to _open_generic_http() with a plain HTTPConnection as the
    connection factory.
    """
    factory = http.client.HTTPConnection
    return self._open_generic_http(factory, url, data)
1748
def http_error(self, url, fp, errcode, errmsg, headers, data=None):
    """Dispatch a non-2xx HTTP response to a handler.

    If a method named http_error_DDD exists (DDD being the status
    code), it is called first; data is passed to it only when it is not
    None.  A true result from that handler is returned.  Otherwise
    http_error_default() decides what happens.

    Derived classes can override this, or provide the specific
    http_error_DDD handlers.
    """
    name = 'http_error_%d' % errcode
    if hasattr(self, name):
        handler = getattr(self, name)
        args = (url, fp, errcode, errmsg, headers)
        if data is not None:
            args += (data,)
        result = handler(*args)
        if result:
            return result
    return self.http_error_default(url, fp, errcode, errmsg, headers)
1764
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Default error handler: close the connection and raise HTTPError.

    The raised HTTPError carries url, errcode, errmsg and headers; no
    file object is attached because fp has just been closed.
    """
    fp.close()
    error = HTTPError(url, errcode, errmsg, headers, None)
    raise error
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001769
if _have_ssl:
    def _https_connection(self, host):
        """Return an HTTPSConnection to host using this opener's key and cert files."""
        return http.client.HTTPSConnection(
            host, key_file=self.key_file, cert_file=self.cert_file)

    def open_https(self, url, data=None):
        """Use HTTPS protocol.

        Delegates to _open_generic_http() with _https_connection as the
        connection factory.
        """
        factory = self._https_connection
        return self._open_generic_http(factory, url, data)
1779
def open_file(self, url):
    """Open a file: URL, which must refer to the local machine.

    A URL of the form //host/... is accepted only when host is empty or
    "localhost"; anything else raises ValueError.  A non-string url (the
    proxy form) raises URLError.  Everything accepted is handed to
    open_local_file().
    """
    if not isinstance(url, str):
        raise URLError('file error', 'proxy support for file protocol currently not implemented')
    names_other_host = (url[:2] == '//'
                        and url[2:3] != '/'
                        and url[2:12].lower() != 'localhost/')
    if names_other_host:
        raise ValueError("file:// scheme is supported only on localhost")
    return self.open_local_file(url)
1788
def open_local_file(self, url):
    """Open a file on the local filesystem and wrap it as a response.

    url is the part of a file: URL after the scheme.  The returned
    addinfourl wraps the file opened in binary mode, with synthesized
    Content-Type, Content-Length and Last-modified headers.

    Raises URLError if the file cannot be stat'ed or if the URL names a
    host that is not this machine, and ValueError for a './' path
    combined with a host.
    """
    import email.utils
    import mimetypes
    host, file = splithost(url)
    localname = url2pathname(file)
    try:
        stats = os.stat(localname)
    except OSError as e:
        # URLError accepts (reason, filename); also passing errno, as
        # before, made this raise TypeError instead of URLError.
        raise URLError(e.strerror, e.filename)
    size = stats.st_size
    modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
    mtype = mimetypes.guess_type(url)[0]
    headers = email.message_from_string(
        'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
        (mtype or 'text/plain', size, modified))
    if not host:
        urlfile = file
        if file[:1] == '/':
            urlfile = 'file://' + file
        return addinfourl(open(localname, 'rb'), headers, urlfile)
    host, port = splitport(host)
    # A host is acceptable only without a port and only when it resolves
    # to one of this machine's own addresses.
    if (not port
       and socket.gethostbyname(host) in (localhost() + thishost())):
        urlfile = file
        if file[:1] == '/':
            urlfile = 'file://' + file
        elif file[:2] == './':
            raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
        return addinfourl(open(localname, 'rb'), headers, urlfile)
    raise URLError('local file error', 'not on local host')
1820
def open_ftp(self, url):
    """Use FTP protocol.

    url is the part of an ftp: URL after the scheme
    ("//[user[:passwd]@]host[:port]/path[;type=X]").  Connections are
    kept in self.ftpcache, keyed by (user, host, port, directory), and
    reused by later calls.

    Returns an addinfourl wrapping the data connection, with
    Content-Type and Content-Length headers when they can be determined.
    Raises URLError for a non-string (proxy) url, a missing host, or any
    FTP error.
    """
    if not isinstance(url, str):
        raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
    import mimetypes
    host, path = splithost(url)
    if not host:
        raise URLError('ftp error', 'no host given')
    host, port = splitport(host)
    user, host = splituser(host)
    if user:
        user, passwd = splitpasswd(user)
    else:
        passwd = None
    host = unquote(host)
    user = unquote(user or '')
    passwd = unquote(passwd or '')
    host = socket.gethostbyname(host)
    if not port:
        import ftplib
        port = ftplib.FTP_PORT
    else:
        port = int(port)
    path, attrs = splitattr(path)
    path = unquote(path)
    dirs = path.split('/')
    dirs, file = dirs[:-1], dirs[-1]
    if dirs and not dirs[0]:
        dirs = dirs[1:]
    if dirs and not dirs[0]:
        dirs[0] = '/'
    key = user, host, port, '/'.join(dirs)
    # XXX thread unsafe!
    if len(self.ftpcache) > MAXFTPCACHE:
        # Prune the cache, rather arbitrarily.  Iterate over a snapshot
        # of the keys: deleting entries while iterating the dict itself
        # raises RuntimeError.
        for k in list(self.ftpcache):
            if k != key:
                v = self.ftpcache[k]
                del self.ftpcache[k]
                v.close()
    try:
        if key not in self.ftpcache:
            self.ftpcache[key] = \
                ftpwrapper(user, passwd, host, port, dirs)
        # Default transfer type: directory listing when no file name is
        # given, binary ("image") otherwise; ";type=" may override it.
        if not file:
            type = 'D'
        else:
            type = 'I'
        for attr in attrs:
            attr, value = splitvalue(attr)
            if attr.lower() == 'type' and \
               value in ('a', 'A', 'i', 'I', 'd', 'D'):
                type = value.upper()
        (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
        mtype = mimetypes.guess_type("ftp:" + url)[0]
        headers = ""
        if mtype:
            headers += "Content-Type: %s\n" % mtype
        if retrlen is not None and retrlen >= 0:
            headers += "Content-Length: %d\n" % retrlen
        headers = email.message_from_string(headers)
        return addinfourl(fp, headers, "ftp:" + url)
    except ftperrors() as msg:
        raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1878
def open_data(self, url, data=None):
    """Use "data" URL.

    url is the part of a data: URL after the scheme.  The returned
    addinfourl wraps a text stream holding the generated header lines
    followed by the decoded payload.  The data argument is ignored.

    Raises URLError for a non-string (proxy) url and IOError when the
    URL contains no comma.
    """
    if not isinstance(url, str):
        raise URLError('data error', 'proxy support for data protocol currently not implemented')
    # ignore POSTed data
    #
    # syntax of data URLs:
    # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    # mediatype := [ type "/" subtype ] *( ";" parameter )
    # data      := *urlchar
    # parameter := attribute "=" value
    try:
        mediatype, payload = url.split(',', 1)
    except ValueError:
        raise IOError('data error', 'bad data URL')
    if not mediatype:
        mediatype = 'text/plain;charset=US-ASCII'
    # A trailing ";token" without '=' is the encoding, not a parameter.
    semi = mediatype.rfind(';')
    if semi >= 0 and '=' not in mediatype[semi:]:
        encoding = mediatype[semi+1:]
        mediatype = mediatype[:semi]
    else:
        encoding = ''
    date_line = 'Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                        time.gmtime(time.time()))
    if encoding == 'base64':
        # XXX is this encoding/decoding ok?
        payload = base64.decodebytes(payload.encode('ascii')).decode('latin-1')
    else:
        payload = unquote(payload)
    lines = [
        date_line,
        'Content-type: %s' % mediatype,
        'Content-Length: %d' % len(payload),
        '',
        payload,
    ]
    text = '\n'.join(lines)
    headers = email.message_from_string(text)
    f = io.StringIO(text)
    #f.fileno = None     # needed for addinfourl
    return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001919
1920
class FancyURLopener(URLopener):
    """URLopener that follows redirects and answers Basic-auth challenges.

    Adds handlers for HTTP 301, 302, 303, 307, 401 and 407 and returns
    the response object (instead of raising) for every other HTTP
    error.
    """

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}    # "realm@host" -> (user, passwd)
        self.tries = 0          # redirects followed in the current chain
        self.maxtries = 10      # redirect limit; a false value disables it

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        response = addinfourl(fp, headers, "http:" + url, errcode)
        return response

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        Follows the redirect; once maxtries redirects have been counted
        the chain is reported as a 500 "Redirect Recursion" error.
        """
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                handler = self.http_error_500
            else:
                handler = self.http_error_default
            self.tries = 0
            return handler(url, fp, 500,
                           "Internal Server Error: Redirect Recursion",
                           headers)
        outcome = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                         data)
        self.tries = 0
        return outcome

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Open the redirect target named by the Location (or URI) header.

        Returns None when neither header is present.  Raises HTTPError
        when the target's scheme is not http, https, ftp or empty.
        """
        for field in ('location', 'uri'):
            if field in headers:
                newurl = headers[field]
                break
        else:
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is not None:
            return self.http_error_default(url, fp, errcode, errmsg, headers)
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only.

        Falls back to URLopener.http_error_default() when the challenge
        is missing, malformed or not Basic, or when retry is false."""
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        challenge = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', challenge)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        # The retry helper is chosen by the scheme of the request in flight.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        return getattr(self, name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only.

        Falls back to URLopener.http_error_default() when the challenge
        is missing, malformed or not Basic, or when retry is false."""
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        challenge = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', challenge)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        return getattr(self, name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Reopen url after storing credentials in the 'http' proxy URL.

        Returns None when no user name or password is obtained.
        """
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # Drop any credentials already embedded in the proxy host; a
        # non-zero offset also tells get_user_passwd to forget its cache.
        at = proxyhost.find('@') + 1
        proxyhost = proxyhost[at:]
        user, passwd = self.get_user_passwd(proxyhost, realm, at)
        if not (user or passwd):
            return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Reopen url after storing credentials in the 'https' proxy URL.

        Returns None when no user name or password is obtained.
        """
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        at = proxyhost.find('@') + 1
        proxyhost = proxyhost[at:]
        user, passwd = self.get_user_passwd(proxyhost, realm, at)
        if not (user or passwd):
            return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Reopen url over http with credentials embedded in the host part.

        Returns None when no user name or password is obtained.
        """
        host, selector = splithost(url)
        at = host.find('@') + 1
        host = host[at:]
        user, passwd = self.get_user_passwd(host, realm, at)
        if not (user or passwd):
            return None
        netloc = "%s:%s@%s" % (quote(user, safe=''),
                               quote(passwd, safe=''), host)
        newurl = 'http://' + netloc + selector
        if data is None:
            return self.open(newurl)
        return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Reopen url over https with credentials embedded in the host part.

        Returns None when no user name or password is obtained.
        """
        host, selector = splithost(url)
        at = host.find('@') + 1
        host = host[at:]
        user, passwd = self.get_user_passwd(host, realm, at)
        if not (user or passwd):
            return None
        netloc = "%s:%s@%s" % (quote(user, safe=''),
                               quote(passwd, safe=''), host)
        newurl = 'https://' + netloc + selector
        if data is None:
            return self.open(newurl)
        return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for realm at host, prompting if needed.

        A cached pair is returned unless clear_cache is true, in which
        case it is discarded and the user is prompted again.  A
        non-empty answer is cached.
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if not clear_cache:
                return self.auth_cache[key]
            del self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd:
            self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!

        Asks on the terminal; returns (None, None) on KeyboardInterrupt.
        """
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
        except KeyboardInterrupt:
            print()
            return None, None
        return user, passwd
2129
2130
2131# Utility functions
2132
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup is done once and the result cached in _localhost.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2140
_thishost = None
def thishost():
    """Return the IP addresses of the current host, as a tuple.

    The lookup is done once and the result cached in _thishost.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    addresses = socket.gethostbyname_ex(socket.gethostname())[2]
    _thishost = tuple(addresses)
    return _thishost
2148
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported on first use and its all_errors tuple cached in
    _ftperrors.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2157
_noheaders = None
def noheaders():
    """Return an empty email Message object.

    The same cached instance is returned on every call.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2165
2166
2167# Utility classes
2168
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections.

    Holds one logged-in ftplib.FTP control connection positioned in a
    given directory.  refcount counts the data-transfer files handed out
    by retrfile() that are still open; the control connection is really
    closed only once close() has been called and no such file remains.
    """

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        """Connect, log in and change into dirs.

        If any of those steps fails the partially opened connection is
        closed before the exception propagates.
        """
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.refcount = 0
        self.keepalive = persistent
        try:
            self.init()
        except BaseException:
            # Don't leak the control connection when connect/login/cwd
            # fails part-way through.
            self.close()
            raise

    def init(self):
        """(Re)establish the control connection and walk into self.dirs."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        """Start retrieving file, or a directory listing.

        type is the FTP transfer type; 'd'/'D' requests a listing.  If
        file cannot be retrieved because of a 550 reply, a listing of
        it is attempted instead.

        Returns (fileobj, length): a binary file object whose close hook
        ends the transfer, and the length reported by
        ftplib's ntransfercmd().
        Raises URLError on FTP permission errors.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'):
            cmd = 'TYPE A'
            isdir = 1
        else:
            cmd = 'TYPE ' + type
            isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection may have gone stale: reconnect once.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # 550 falls through to the directory-listing attempt.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error', reason) from reason
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        """Read the pending end-of-transfer reply, ignoring FTP errors."""
        import ftplib
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftplib.all_errors:
            pass

    def close(self):
        """Stop keeping the connection alive; close it now if no file is open."""
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        """Close hook for files from retrfile(): release one reference."""
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        """Close the control connection, ignoring FTP errors."""
        import ftplib
        self.endtransfer()
        try:
            self.ftp.close()
        except ftplib.all_errors:
            pass
2264
2265# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Every environment variable named <scheme>_proxy (compared
    case-insensitively) that has a non-empty value contributes one
    entry.  If you need a different way, you can pass a proxies
    dictionary to the [Fancy]URLopener constructor.

    """
    suffix = '_proxy'
    return {
        name.lower()[:-len(suffix)]: value
        for name, value in os.environ.items()
        if value and name.lower()[-len(suffix):] == suffix
    }
2281
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Reads the no_proxy (or, if that is unset or empty, NO_PROXY)
    environment variable: a comma-separated list of DNS suffixes, or
    '*' for all hosts.  Returns 1 if host, with or without its port,
    ends with one of the suffixes, otherwise 0.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # check if the host ends with any of the DNS suffixes
    suffixes = [entry.strip() for entry in no_proxy.split(',')]
    matched = any(
        suffix and (hostonly.endswith(suffix) or host.endswith(suffix))
        for suffix in suffixes)
    # otherwise, don't bypass
    return 1 if matched else 0
2301
2302
Ronald Oussorene72e1612011-03-14 18:15:25 -04002303# This code tests an OSX specific data structure but is testable on all
2304# platforms
2305def _proxy_bypass_macosx_sysconf(host, proxy_settings):
2306 """
2307 Return True iff this host shouldn't be accessed using a proxy
2308
2309 This function uses the MacOSX framework SystemConfiguration
2310 to fetch the proxy information.
2311
2312 proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
2313 { 'exclude_simple': bool,
2314 'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
2315 }
2316 """
Ronald Oussorene72e1612011-03-14 18:15:25 -04002317 from fnmatch import fnmatch
2318
2319 hostonly, port = splitport(host)
2320
2321 def ip2num(ipAddr):
2322 parts = ipAddr.split('.')
2323 parts = list(map(int, parts))
2324 if len(parts) != 4:
2325 parts = (parts + [0, 0, 0, 0])[:4]
2326 return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
2327
2328 # Check for simple host names:
2329 if '.' not in host:
2330 if proxy_settings['exclude_simple']:
2331 return True
2332
2333 hostIP = None
2334
2335 for value in proxy_settings.get('exceptions', ()):
2336 # Items in the list are strings like these: *.local, 169.254/16
2337 if not value: continue
2338
2339 m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
2340 if m is not None:
2341 if hostIP is None:
2342 try:
2343 hostIP = socket.gethostbyname(hostonly)
2344 hostIP = ip2num(hostIP)
2345 except socket.error:
2346 continue
2347
2348 base = ip2num(m.group(1))
2349 mask = m.group(2)
2350 if mask is None:
2351 mask = 8 * (m.group(1).count('.') + 1)
2352 else:
2353 mask = int(mask[1:])
2354 mask = 32 - mask
2355
2356 if (hostIP >> mask) == (base >> mask):
2357 return True
2358
2359 elif fnmatch(host, value):
2360 return True
2361
2362 return False
2363
2364
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002365if sys.platform == 'darwin':
Ronald Oussoren84151202010-04-18 20:46:11 +00002366 from _scproxy import _get_proxy_settings, _get_proxies
2367
def proxy_bypass_macosx_sysconf(host):
    """Decide from the system proxy settings whether host bypasses the proxy.

    Fetches the current settings with _get_proxy_settings() and hands
    them, together with host, to _proxy_bypass_macosx_sysconf().
    """
    return _proxy_bypass_macosx_sysconf(host, _get_proxy_settings())
Ronald Oussoren84151202010-04-18 20:46:11 +00002371
def getproxies_macosx_sysconf():
    """Return a dictionary of scheme -> proxy server URL mappings.

    The mappings are obtained from _scproxy._get_proxies(), i.e. from
    the MacOSX SystemConfiguration framework.
    """
    system_proxies = _get_proxies()
    return system_proxies
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002379
Ronald Oussoren84151202010-04-18 20:46:11 +00002380
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002381
def proxy_bypass(host):
    """Test whether host should be accessed without a proxy.

    If any proxy is configured through the environment, only the
    environment's no_proxy setting is consulted; otherwise the decision
    comes from the MacOSX system configuration.
    """
    if not getproxies_environment():
        return proxy_bypass_macosx_sysconf(host)
    return proxy_bypass_environment(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002387
def getproxies():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Proxies found in the environment win; the MacOSX system
    configuration is queried only when the environment defines none.
    """
    env_proxies = getproxies_environment()
    if env_proxies:
        return env_proxies
    return getproxies_macosx_sysconf()
2390
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002391
2392elif os.name == 'nt':
def getproxies_registry():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Win32 uses the registry to store proxies.

    The "Internet Settings" key of the current user is read.  Nothing is
    returned unless ProxyEnable is set.  ProxyServer is either a single
    "host:port" value applied to http, https and ftp, or a ';'-separated
    list of "protocol=address" entries; entries without '=' are skipped.
    An empty dictionary is returned when winreg is unavailable or the
    registry values cannot be read.

    """
    proxies = {}
    try:
        import winreg
    except ImportError:
        # Std module, so should be around - but you never know!
        return proxies
    try:
        # The key handle is used as a context manager so that it is closed
        # even when one of the queries below raises.
        with winreg.OpenKey(
                winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings'
                ) as internetSettings:
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        if '=' not in p:
                            # Empty or malformed segment (e.g. a trailing
                            # ';'): skip it, keep parsing the rest.
                            continue
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
    except (WindowsError, ValueError, TypeError):
        # Either registry key not found etc, or the value in an
        # unexpected format.
        # proxies already set up to be empty so nothing to do
        pass
    return proxies
2437
def getproxies():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Settings from the environment take precedence; the registry is
    consulted only when the environment defines no proxies.

    """
    env_proxies = getproxies_environment()
    if env_proxies:
        return env_proxies
    return getproxies_registry()
2446
def proxy_bypass_registry(host):
    """Test against the registry whether host should bypass the proxy.

    Reads ProxyEnable and ProxyOverride from the current user's
    "Internet Settings" key.  ProxyOverride is a ';'-separated list of
    glob patterns ('*' and '?' wildcards); the special entry '<local>'
    stands for every host name without a dot.  Patterns are compared
    case-insensitively, and must match the whole name, against the host
    (port stripped), its resolved address and its fully qualified name.

    Returns 1 if the proxy should be bypassed, 0 otherwise -- including
    when winreg is unavailable, the values cannot be read, the proxy is
    disabled or no override is configured.
    """
    try:
        import winreg
    except ImportError:
        # Std modules, so should be around - but you never know!
        return 0
    try:
        # Context manager: the key handle is closed on every path.
        with winreg.OpenKey(
                winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings'
                ) as internetSettings:
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            # Returned as Unicode but problems if not converted to ASCII
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
    except WindowsError:
        return 0
    if not proxyEnable or not proxyOverride:
        return 0
    # try to make a host list from name and IP address.
    rawHost = splitport(host)[0]
    candidates = [rawHost]
    try:
        addr = socket.gethostbyname(rawHost)
        if addr != rawHost:
            candidates.append(addr)
    except socket.error:
        pass
    try:
        fqdn = socket.getfqdn(rawHost)
        if fqdn != rawHost:
            candidates.append(fqdn)
    except socket.error:
        pass
    # now check if we match one of the registry values.
    for test in proxyOverride.split(';'):
        test = test.strip()
        if not test:
            # An empty entry (e.g. from a trailing ';') would otherwise
            # become an empty pattern that matches every host.
            continue
        if test == '<local>':
            if '.' not in rawHost:
                return 1
            continue
        # Translate the glob into a regular expression: escape everything,
        # then re-enable the two wildcards and anchor at the end so that
        # 'example.com' does not also match 'example.com.evil.net'.
        pattern = re.escape(test)
        pattern = pattern.replace(r'\*', '.*').replace(r'\?', '.')
        pattern += r'\Z'
        for val in candidates:
            if re.match(pattern, val, re.I):
                return 1
    return 0
2497
def proxy_bypass(host):
    """Test whether host should be accessed without a proxy.

    If any proxy is configured through the environment, only the
    environment's no_proxy setting is consulted; otherwise the decision
    comes from the registry.

    """
    if not getproxies_environment():
        return proxy_bypass_registry(host)
    return proxy_bypass_environment(host)
2509
2510else:
2511 # By default use environment variables
2512 getproxies = getproxies_environment
2513 proxy_bypass = proxy_bypass_environment