blob: 97fd5440a2e1747d782719c73b2607af25092849 [file] [log] [blame]
"""An extensible library for opening URLs using a variety of protocols

The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.

The OpenerDirector manages a collection of Handler objects that do
all the actual work.  Each Handler implements a particular protocol or
option.  The OpenerDirector is a composite object that invokes the
Handlers needed to open the requested URL.  For example, the
HTTPHandler performs HTTP GET and POST requests and deals with
non-error returns.  The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
deals with digest authentication.

urlopen(url, data=None) -- Basic usage is the same as original
urllib.  Pass the url and optionally data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of a URL.  Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.

build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of the default
handler, the argument will be installed instead of the default.

install_opener -- Installs a new opener as the default opener.

objects of interest:

OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
the Handler classes, while dealing with requests and responses.

Request -- An object that encapsulates the state of a request.  The
state can be as simple as the URL.  It can also include extra HTTP
headers, e.g. a User-Agent.

BaseHandler --

internals:
BaseHandler and parent
_call_chain conventions

Example usage:

import urllib.request

# set up authentication info
authinfo = urllib.request.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
                      uri='https://mahler:8092/site-updates.py',
                      user='klem',
                      passwd='geheim$parole')

proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib.request.build_opener(proxy_support, authinfo,
                                     urllib.request.CacheFTPHandler)

# install it
urllib.request.install_opener(opener)

f = urllib.request.urlopen('http://www.python.org/')
"""
69
# XXX issues:
# If an authentication error handler tries to perform authentication
# for some reason but fails, how should the error be
# signalled?  The client needs to know the HTTP error code.  But if
# the handler knows that the problem was, e.g., that it didn't know
# the hash algo that was requested in the challenge, it would be good to
# pass that information along to the client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation

# Possible extensions:
# complex proxies  XXX not sure what exactly was meant by this
# abstract factory for opener
83
84import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000085import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000086import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
Jeremy Hylton1afc1692008-06-18 20:49:58 +000092import re
93import socket
94import sys
95import time
Senthil Kumaran7bc0d872010-12-19 10:49:52 +000096import collections
Jeremy Hylton1afc1692008-06-18 20:49:58 +000097
Georg Brandl13e89462008-07-01 19:56:00 +000098from urllib.error import URLError, HTTPError, ContentTooShortError
99from urllib.parse import (
100 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
101 splittype, splithost, splitport, splituser, splitpasswd,
Senthil Kumarand95cc752010-08-08 11:27:53 +0000102 splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000103from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000104
# Check for SSL: the ssl module is optional, so remember whether the import
# worked instead of letting ImportError escape at module import time.
try:
    import ssl
    _have_ssl = True
except ImportError:
    _have_ssl = False
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000112
# Public API of this module, grouped by kind.
__all__ = [
    # Classes
    'Request',
    'OpenerDirector',
    'BaseHandler',
    'HTTPDefaultErrorHandler',
    'HTTPRedirectHandler',
    'HTTPCookieProcessor',
    'ProxyHandler',
    'HTTPPasswordMgr',
    'HTTPPasswordMgrWithDefaultRealm',
    'AbstractBasicAuthHandler',
    'HTTPBasicAuthHandler',
    'ProxyBasicAuthHandler',
    'AbstractDigestAuthHandler',
    'HTTPDigestAuthHandler',
    'ProxyDigestAuthHandler',
    'HTTPHandler',
    'FileHandler',
    'FTPHandler',
    'CacheFTPHandler',
    'UnknownHandler',
    'HTTPErrorProcessor',
    # Functions
    'urlopen',
    'install_opener',
    'build_opener',
    'pathname2url',
    'url2pathname',
    'getproxies',
    # Legacy interface
    'urlretrieve',
    'urlcleanup',
    'URLopener',
    'FancyURLopener',
]
128
# Used in the User-Agent header sent.  Built from sys.version_info rather
# than by slicing sys.version: sys.version[:3] truncates a two-digit minor
# version (a version string starting "3.10" would be reported as "3.1").
__version__ = '%d.%d' % sys.version_info[:2]
131
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None):
    """Open *url* and return whatever the selected opener's open() returns.

    url, data and timeout are passed straight through to opener.open().

    If cafile or capath is given, a one-off opener is built around an
    HTTPSHandler whose SSL context requires certificate verification against
    those locations and checks the host name; that opener is used for this
    call only and is not stored as the module default.  Otherwise the
    module-level default opener is used, created with build_opener() the
    first time it is needed.

    Raises ValueError if cafile/capath is given but the ssl module could
    not be imported.
    """
    global _opener
    if cafile or capath:
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.options |= ssl.OP_NO_SSLv2
        # Reaching this branch means CA locations were supplied, so
        # verification is always required here.
        context.verify_mode = ssl.CERT_REQUIRED
        context.load_verify_locations(cafile, capath)
        https_handler = HTTPSHandler(context=context, check_hostname=True)
        opener = build_opener(https_handler)
    elif _opener is None:
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000154
def install_opener(opener):
    """Store *opener* in the module-level _opener slot (the default opener)."""
    global _opener
    _opener = opener
158
# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Delegate to the retrieve() method of the shared legacy opener.

    A FancyURLopener is created on first use and kept in the module-level
    _urlopener; the result of its retrieve() call is returned unchanged.
    """
    global _urlopener
    legacy_opener = _urlopener
    if not legacy_opener:
        legacy_opener = _urlopener = FancyURLopener()
    return legacy_opener.retrieve(url, filename, reporthook, data)
166
def urlcleanup():
    """Call cleanup() on the legacy opener, if any, and drop the default opener."""
    global _opener
    legacy_opener = _urlopener
    if legacy_opener:
        legacy_opener.cleanup()
    if _opener:
        _opener = None
173
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is the network-location part of request.full_url; when that
    is empty, the request's "Host" header is used instead (empty string if
    the header is missing).  A trailing ":port" is stripped.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    host = urlparse(request.full_url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # drop the port, if one is present
    without_port = _cut_port_re.sub("", host, 1)
    return without_port.lower()
191
class Request:
    """Encapsulate the state of a single URL request.

    Attributes set by the constructor:
      full_url         -- the URL with any '<URL:...>' wrapper and '#fragment'
                          removed
      fragment         -- the fragment split off the URL, as returned by
                          splittag()
      type, host, selector -- scheme, host part and remainder of the URL
      data             -- request payload, or None
      headers          -- regular headers, keys normalised with
                          str.capitalize()
      unredirected_hdrs -- headers that are kept separate from `headers`
      origin_req_host, unverifiable, method -- stored as given (the origin
                          host defaults to request_host(self))
    """

    def __init__(self, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False,
                 method=None):
        """Parse *url* and record the request parameters.

        headers is an optional mapping of header names to values; None
        (the default) means no headers.  Raises ValueError if the URL has
        no scheme.
        """
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        self.full_url, self.fragment = splittag(self.full_url)
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        # None instead of a shared mutable {} default; treated as "no headers".
        if headers is not None:
            for key, value in headers.items():
                self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self.method = method
        self._parse()

    def _parse(self):
        """Split full_url into type, host and selector."""
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        if self.method is not None:
            return self.method
        elif self.data is not None:
            return "POST"
        else:
            return "GET"

    # Begin deprecated methods

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        # Re-attach the fragment that __init__ split off, if there was one.
        if self.fragment:
            return '%s#%s' % (self.full_url, self.fragment)
        else:
            return self.full_url

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Route this request through the proxy at *host*.

        For an https request with no tunnel host yet, the original host is
        remembered in _tunnel_host and type/selector are left alone;
        otherwise the type becomes *type* and the selector becomes the
        full URL.  In both cases self.host is replaced by *host*.
        """
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        # set_proxy() makes the selector equal to the full URL.
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        # Lookup is exact: header_name must already be in capitalize() form.
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        # Regular headers take precedence over unredirected ones.
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return all headers as a list of (name, value) pairs.

        When a name is present in both dicts, the value from `headers`
        overrides the one from `unredirected_hdrs`.
        """
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
296
class OpenerDirector:
    """Route requests through the handlers registered with add_handler().

    Handlers are indexed by the names of their methods:
    <protocol>_open, <protocol>_request, <protocol>_response and
    <protocol>_error_<kind>.
    """

    def __init__(self):
        self.addheaders = [('User-agent', "Python-urllib/%s" % __version__)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler* under every hook its method names advertise.

        Raises TypeError if the object has no add_parent attribute.  The
        handler is added to self.handlers (and told about its parent) only
        if at least one of its attributes matched a hook pattern.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        # Conditions that map straight to one table, keyed by protocol.
        plain_tables = {
            "open": self.handle_open,
            "response": self.process_response,
            "request": self.process_request,
        }

        registered = False
        for name in dir(handler):
            if name in ("redirect_request", "do_open", "proxy_open"):
                # oops, coincidental match
                continue

            # Split at the first underscore: "<protocol>_<condition>".
            sep = name.find("_")
            protocol = name[:sep]
            condition = name[sep + 1:]

            if condition.startswith("error"):
                # "<protocol>_error_<kind>": kind is whatever follows the
                # second underscore, as an int when it looks like one.
                kind_sep = condition.find("_") + sep + 1
                kind = name[kind_sep + 1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                table = self.handle_error.setdefault(protocol, {})
            elif condition in plain_tables:
                kind = protocol
                table = plain_tables[condition]
            else:
                continue

            bucket = table.setdefault(kind, [])
            if bucket:
                # keep each bucket ordered by the handlers' own comparison
                bisect.insort(bucket, handler)
            else:
                bucket.append(handler)
            registered = True

        if registered:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        for handler in chain.get(kind, ()):
            result = getattr(handler, meth_name)(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a Request-like object).

        The request passes through the <protocol>_request processors, the
        open handlers, and then the <protocol>_response processors; the
        final response is returned.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        request_hook = protocol + "_request"
        for processor in self.process_request.get(protocol, []):
            req = getattr(processor, request_hook)(req)

        response = self._open(req, data)

        # post-process response
        response_hook = protocol + "_response"
        for processor in self.process_response.get(protocol, []):
            response = getattr(processor, response_hook)(req, response)

        return response

    def _open(self, req, data=None):
        # Try the 'default' handlers, then the protocol-specific ones, and
        # fall back to the 'unknown' handlers.
        opened = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if opened:
            return opened

        protocol = req.type
        opened = self._call_chain(self.handle_open, protocol,
                                  protocol + '_open', req)
        if opened:
            return opened

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the matching handlers and return their result."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            table = self.handle_error['http']  # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            is_http = True
        else:
            table = self.handle_error
            meth_name = proto + '_error'
            is_http = False

        handled = self._call_chain(table, proto, meth_name, *args)
        if handled:
            return handled

        if is_http:
            # nobody claimed this status code: fall back to the default chain
            return self._call_chain(table, 'default', 'http_error_default',
                                    *args)
434
435# XXX probably also want an abstract factory that knows when it makes
436# sense to skip a superclass in favor of a subclass and when it might
437# make sense to include both
438
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    Each argument may be a handler instance or a handler class (classes
    are instantiated with no arguments).  A default handler class is left
    out when any argument is a subclass, or an instance of a subclass, of
    it.  Defaults are added first, then the arguments in the order given.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    def overrides(candidate, default):
        # True when *candidate* (class or instance) stands in for *default*.
        if isclass(candidate):
            return issubclass(candidate, default)
        return isinstance(candidate, default)

    opener = OpenerDirector()
    defaults = [ProxyHandler, UnknownHandler, HTTPHandler,
                HTTPDefaultErrorHandler, HTTPRedirectHandler,
                FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        defaults.append(HTTPSHandler)

    # Decide which defaults survive before instantiating any of them.
    kept = [klass for klass in defaults
            if not any(overrides(given, klass) for given in handlers)]

    for klass in kept:
        opener.add_handler(klass())

    for given in handlers:
        opener.add_handler(given() if isclass(given) else given)
    return opener
475 return opener
476
class BaseHandler:
    """Base class for handlers; provides parent tracking and ordering."""

    # Sort key compared by __lt__; lower values sort first.
    handler_order = 500

    def add_parent(self, parent):
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        try:
            other_order = other.handler_order
        except AttributeError:
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other_order
494
495
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return *response* for 2xx codes, else the parent's error() result."""
        status = response.code
        reason = response.msg
        headers = response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= status < 300:
            return response

        return self.parent.error(
            'http', request, response, status, reason, headers)

    https_response = http_response
512
class HTTPDefaultErrorHandler(BaseHandler):
    """Fallback HTTP error handler: always raises HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        url = req.full_url
        raise HTTPError(url, code, msg, hdrs, fp)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000516
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirects, with loop protection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.

        The new Request carries the original headers minus
        Content-Length/Content-Type, has no data, keeps the original
        origin_req_host and is marked unverifiable.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items()
                      if k.lower() not in CONTENT_HEADERS}
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow the redirect named by the Location (or URI) header.

        Returns the response of re-opening the redirect target through the
        parent opener, or None when there is no usable header or
        redirect_request() declines.  Raises HTTPError when the target uses
        a scheme other than http, https or ftp, or when the redirect limits
        (max_repeats / max_redirections) are reached.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.  An empty scheme means a relative
        # reference, which is resolved against the (already allowed) request
        # URL below, so it must be accepted here.
        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        # Only an absolute URL with an authority gets "/" for an empty path;
        # doing it for a relative reference such as "?page=2" would wrongly
        # retarget it at the site root.
        if not urlparts.path and urlparts.netloc:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
619
620
def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, remainder = splittype(proxy)
    if remainder.startswith("/"):
        # URL
        if not remainder.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
        # and 3.3.), path is empty or starts with '/'
        path_start = remainder.find("/", 2)
        if path_start == -1:
            authority = remainder[2:]
        else:
            authority = remainder[2:path_start]
    else:
        # bare authority: whatever splittype took for a scheme was not one
        scheme = None
        authority = proxy

    userinfo, hostport = splituser(authority)
    if userinfo is None:
        user = password = None
    else:
        user, password = splitpasswd(userinfo)
    return scheme, user, password, hostport
692
class ProxyHandler(BaseHandler):
    """Send requests through the proxies given as a {scheme: proxy URL} mapping.

    For every scheme in the mapping the constructor installs a
    <scheme>_open attribute that forwards to proxy_open().
    """

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, proxy_url in proxies.items():
            # Defaults bind the current loop values to this function.
            def open_via_proxy(r, proxy=proxy_url, type=scheme,
                               meth=self.proxy_open):
                return meth(r, proxy, type)
            setattr(self, '%s_open' % scheme, open_via_proxy)

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*; re-open it if the URL type changed.

        Returns None when the host is bypassed or when the remaining
        handlers can deal with the rewritten request; otherwise returns the
        result of opening the request again through the parent.
        """
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            credentials = '%s:%s' % (unquote(user), unquote(password))
            encoded = base64.b64encode(credentials.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + encoded)

        req.set_proxy(unquote(hostport), proxy_type)

        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None

        # need to start over, because the other handlers don't
        # grok the proxy's URL type
        # e.g. if we have a constructor arg proxies like so:
        # {'http': 'ftp://proxy.example.com'}, we may end up turning
        # a request for http://acme.example.com/a into one for
        # ftp://proxy.example.com/a
        return self.parent.open(req, timeout=req.timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000734
class HTTPPasswordMgr:
    """Map (realm, URI) to (user, password) credentials.

    self.passwd is {realm: {tuple_of_reduced_uris: (user, passwd)}}, where a
    reduced URI is the (authority, path) pair produced by reduce_uri().
    """

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* and one URI or a sequence of URIs.

        Each URI is stored twice: once with the scheme's default port
        filled in and once as given.
        """
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        realm_entries = self.passwd.setdefault(realm, {})
        for default_port in True, False:
            reduced_uri = tuple(
                self.reduce_uri(u, default_port) for u in uri)
            realm_entries[reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) registered for *authuri* in *realm*.

        Returns (None, None) when no registered URI is equal to, or a
        path ancestor of, *authuri*.
        """
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare whole path segments: a plain character prefix would treat
        # "/foobar" as being below "/foo" and hand out its credentials.
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
797
798
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the entries stored under realm None."""

    def find_user_password(self, realm, authuri):
        # Look in the requested realm first ...
        credentials = HTTPPasswordMgr.find_user_password(self, realm, authuri)
        user, password = credentials
        if user is None:
            # ... and only then in the default (None) realm.
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
807
808
class AbstractBasicAuthHandler:
    """Shared implementation of HTTP Basic authentication retries.

    Concrete handlers supply ``auth_header`` (the request header that
    carries the credentials) and ``parent`` (the opener used to re-send
    the request).
    """

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    #
    # Each challenge is anchored at the start of the header or at a comma.
    # The scheme, whitespace and comma character classes are disjoint, so
    # the pattern cannot backtrack exponentially on comma-heavy headers
    # (the former '(?:.*,)*' prefix could).
    rx = re.compile('(?:^|,)'       # start of the header or a ','
                    '[ \t]*'        # optional whitespace
                    '([^ \t,]+)'    # scheme, e.g. "Basic"
                    '[ \t]+'        # mandatory whitespace
                    'realm=(["\'])(.*?)\\2',  # quoted realm value
                    re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        """Use *password_mgr* for lookups, or a fresh HTTPPasswordMgr."""
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password
        self.retried = 0

    def reset_retry_count(self):
        """Forget how many authentication attempts have been made."""
        self.retried = 0

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Answer the challenge found in header *authreq* of *headers*.

        Returns the response of the retried request, or None when there
        is no usable Basic challenge or no stored credentials.

        Raises HTTPError once more than five attempts have been made and
        ValueError when the header advertises a non-Basic scheme.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if self.retried > 5:
            # retry sending the username:password 5 times before failing.
            raise HTTPError(req.get_full_url(), 401, "basic auth failed",
                            headers, None)
        self.retried += 1

        if not authreq:
            return None
        tokens = authreq.split()
        if not tokens:
            # Header present but blank: there is no challenge to answer.
            return None
        scheme = tokens[0]
        if scheme.lower() != 'basic':
            raise ValueError("AbstractBasicAuthHandler does not"
                             " support the following scheme: '%s'" %
                             scheme)

        # Keep the historical behaviour of using the last challenge that
        # carries a realm.
        challenge = None
        for challenge in AbstractBasicAuthHandler.rx.finditer(authreq):
            pass
        if challenge is None:
            return None
        scheme, quote, realm = challenge.groups()
        if scheme.lower() != 'basic':
            return None
        response = self.retry_http_basic_auth(host, req, realm)
        if response and response.code != 401:
            self.retried = 0
        return response

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send *req* with Basic credentials for *realm*, if any exist.

        Returns None when no password is stored or when the very same
        credentials were already sent with this request.
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is None:
            return None
        raw = "%s:%s" % (user, pw)
        auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
        if req.headers.get(self.auth_header, None) == auth:
            # These credentials were already rejected; do not loop.
            return None
        req.add_unredirected_header(self.auth_header, auth)
        return self.parent.open(req, timeout=req.timeout)
873
874
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses with Basic credentials in ``Authorization``."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # The full request URL is what the password lookup is keyed on.
        target = req.full_url
        outcome = self.http_error_auth_reqed('www-authenticate',
                                             target, req, headers)
        self.reset_retry_count()
        return outcome
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000885
886
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses with Basic credentials for the proxy."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        proxy_authority = req.host
        outcome = self.http_error_auth_reqed('proxy-authenticate',
                                             proxy_authority, req, headers)
        self.reset_retry_count()
        return outcome
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000901
902
# Return n random bytes.
# Alias for os.urandom; the digest handler's get_cnonce() calls it to mix
# eight random bytes into the client nonce.
_randombytes = os.urandom
905
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000906
class AbstractDigestAuthHandler:
    """Shared implementation of HTTP Digest authentication retries.

    Concrete handlers supply ``auth_header`` (the request header that
    carries the credentials) and ``parent`` (the opener used to re-send
    the request).
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        """Use *passwd* for lookups, or a fresh HTTPPasswordMgr."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        """Forget how many authentication attempts have been made."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Answer the Digest challenge found in *auth_header* of *headers*.

        Returns the response of the retried request, or None when the
        header is missing, names the Basic scheme, or cannot be answered.

        Raises HTTPError once more than five attempts have been made and
        ValueError for a scheme that is neither Digest nor Basic.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        """Re-send *req* with a Digest answer to the challenge *auth*.

        Returns None when no answer can be built or when the very same
        header value was already sent with this request.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        """Return a 16-hex-digit client nonce for use with *nonce*."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the credentials string answering the parsed challenge.

        *chal* maps challenge parameter names to values.  Returns the
        text that follows ``Digest `` in the header, or None when the
        challenge lacks realm/nonce, names an unsupported algorithm, or
        no user is known for the realm.

        Raises URLError when the server offers a qop list that does not
        include ``auth``.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
        except KeyError:
            return None
        qop = chal.get('qop')
        algorithm = chal.get('algorithm', 'MD5')
        # mod_digest doesn't send an opaque, even though it isn't
        # supposed to be optional
        opaque = chal.get('opaque', None)

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in [option.strip() for option in qop.split(',')]:
            # The server may offer several protections (e.g.
            # "auth,auth-int"); we always answer with plain "auth".
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth',
                                           H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the ``(H, KD)`` functions for *algorithm*.

        ``H`` hashes an ASCII string to a hex digest and ``KD(s, d)`` is
        ``H("s:d")``.  For an algorithm other than ``MD5`` or ``SHA`` the
        pair ``(None, None)`` is returned so callers can decline the
        challenge.
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # XXX MD5-sess
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        """Placeholder for the entity digest; always returns None."""
        # XXX not implemented yet
        return None
1046
1047
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication (RFC 2069) for 401 responses.

    Preferred over basic authentication because the password itself is
    never sent in the clear.
    """

    handler_order = 490  # before Basic auth
    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # Only the network-location part of the URL is handed on as host.
        netloc = urlparse(req.full_url)[1]
        outcome = self.http_error_auth_reqed('www-authenticate',
                                             netloc, req, headers)
        self.reset_retry_count()
        return outcome
1064
1065
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication for 407 responses issued by a proxy."""

    handler_order = 490  # before Basic auth
    auth_header = 'Proxy-Authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        proxy_host = req.host
        outcome = self.http_error_auth_reqed('proxy-authenticate',
                                             proxy_host, req, headers)
        self.reset_retry_count()
        return outcome
1077
class AbstractHTTPHandler(BaseHandler):
    """Common request preparation and sending for HTTP and HTTPS."""

    def __init__(self, debuglevel=0):
        """Remember *debuglevel*; it is applied to every connection."""
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Change the debug level used for subsequent connections."""
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in the headers a request needs before it is sent.

        Adds Content-type/Content-length for requests with data, the Host
        header, and the opener's default headers -- each only when the
        request does not already carry it.  Returns *request*.

        Raises URLError when the request has no host, TypeError when the
        data is a str, and ValueError when the data is an iterable of
        unknown length and no Content-length was supplied.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                raise TypeError("POST data should be bytes"
                                " or an iterable of bytes. It cannot be str.")
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                try:
                    mv = memoryview(data)
                except TypeError:
                    # Not a buffer: its length cannot be computed here.
                    # The ABC lives in collections.abc; the alias in
                    # collections itself no longer exists on current
                    # Python versions.
                    from collections.abc import Iterable
                    if isinstance(data, Iterable):
                        raise ValueError("Content-Length should be specified "
                                "for iterable data of type %r %r" % (type(data),
                                data))
                else:
                    request.add_unredirected_header(
                            'Content-length', '%d' % (len(mv) * mv.itemsize))

        sel_host = host
        if request.has_proxy():
            # Through a proxy the selector is the full URL; Host must name
            # the origin server taken from it, not the proxy.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        Extra keyword arguments are passed to the http_class constructor.

        Raises URLError when the request has no host or when sending it
        fails with a socket error.  The connection is closed if either
        sending the request or reading the response fails.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        # Honour the level given to __init__/set_http_debuglevel.
        h.set_debuglevel(self._debuglevel)

        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers)
            except socket.error as err:  # timeout error
                raise URLError(err)
            r = h.getresponse()
        except:
            # Whatever went wrong, do not leave the connection open.
            h.close()
            raise

        r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg.  It would be good to mark this
        # attribute is deprecated and get then to use info() or
        # .headers.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001179
1180
class HTTPHandler(AbstractHTTPHandler):
    """Open ``http:`` URLs over an http.client.HTTPConnection."""

    # Request pre-processing is the shared header fix-up.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        connection_class = http.client.HTTPConnection
        return self.do_open(connection_class, req)
1187
if hasattr(http.client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Open ``https:`` URLs over an http.client.HTTPSConnection."""

        # Request pre-processing is the shared header fix-up.
        https_request = AbstractHTTPHandler.do_request_

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._check_hostname = check_hostname
            self._context = context

        def https_open(self, req):
            # Both TLS settings are forwarded to the connection constructor.
            tls_options = {
                'context': self._context,
                'check_hostname': self._check_hostname,
            }
            return self.do_open(http.client.HTTPSConnection, req,
                                **tls_options)
1202
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to requests and collect them from responses."""

    def __init__(self, cookiejar=None):
        import http.cookiejar
        # Without an explicit jar, every processor gets a private one.
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    # HTTPS traffic is treated exactly like HTTP.
    https_request = http_request
    https_response = http_response
1220
class UnknownHandler(BaseHandler):
    """Handler of last resort: reject URLs whose scheme nobody handles."""

    def unknown_open(self, req):
        scheme = req.type
        raise URLError('unknown url type: %s' % scheme)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001225
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dict mapping each key to its value; one pair of enclosing
    double quotes is removed from a value.  An empty value (``"k="``) is
    kept as the empty string.

    Raises ValueError for an element that contains no ``=``.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices instead of indexing: v may be empty.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1235
def parse_http_list(s):
    """Split a comma-separated HTTP header list (RFC 2068 Section 2).

    Elements may contain double-quoted strings, inside which a comma does
    not separate elements.  Within a quoted string a backslash escapes
    the next character (the backslash itself is dropped).  Single quotes
    have no special meaning.  Every returned element is stripped of
    surrounding whitespace; an empty trailing element is omitted.
    """
    items = []
    buf = []
    escaped = False
    in_quotes = False

    for ch in s:
        if escaped:
            # Character following a backslash is taken literally.
            buf.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                buf.append(ch)
        elif ch == ',':
            # Unquoted comma: the current element is complete.
            items.append(''.join(buf))
            buf = []
        else:
            if ch == '"':
                in_quotes = True
            buf.append(ch)

    # append last part
    if buf:
        items.append(''.join(buf))

    return [item.strip() for item in items]
1278
class FileHandler(BaseHandler):
    """Open ``file:`` URLs that refer to the local machine."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open *req* as a local file.

        A selector of the form ``//name...`` combined with a host other
        than ``localhost`` is only accepted when that host is one of the
        addresses returned by get_names(); otherwise URLError is raised.
        """
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            # Membership test: an identity comparison against the tuple
            # would reject every host.
            if req.host not in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
            # NOTE(review): a host that *is* local falls through here and
            # None is returned -- confirm whether the file should be opened.
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the addresses of this machine (cached on the class)."""
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the local file named by *req*.

        The response carries synthesized Content-type, Content-length and
        Last-modified headers.  Raises URLError when the file cannot be
        accessed or when the URL's host is not this machine.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001330
def _safe_gethostbyname(host):
    """Resolve *host* to an address, or return None if lookup fails."""
    try:
        address = socket.gethostbyname(host)
    except socket.gaierror:
        address = None
    return address
1336
class FTPHandler(BaseHandler):
    """Open ``ftp:`` URLs, one fresh connection per request."""

    def ftp_open(self, req):
        """Retrieve the file or directory listing named by *req*.

        Returns an addinfourl whose headers carry Content-type and
        Content-length when they could be determined.  Raises URLError
        for a missing host, a failed host lookup, or any ftplib error.
        """
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        port = ftplib.FTP_PORT if port is None else int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)
        path, attrs = splitattr(req.selector)
        segments = [unquote(segment) for segment in path.split('/')]
        dirs, filename = segments[:-1], segments[-1]
        if dirs and not dirs[0]:
            # Leading empty segment comes from the path's initial '/'.
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Binary transfer for a file, directory listing otherwise,
            # unless a ";type=" attribute says differently.
            transfer_type = 'I' if filename else 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    transfer_type = value.upper()
            fp, retrlen = fw.retrfile(filename, transfer_type)
            header_lines = []
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                header_lines.append("Content-type: %s\n" % mtype)
            if retrlen is not None and retrlen >= 0:
                header_lines.append("Content-length: %d\n" % retrlen)
            headers = email.message_from_string("".join(header_lines))
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Create the (non-persistent) ftpwrapper used for one request."""
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001394
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps FTP connections open for reuse.

    Connections are cached per (user, host, port, directory, timeout),
    expire ``delay`` seconds after their last use, and the cache is kept
    below ``max_conns`` entries.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}      # key -> connection wrapper
        self.timeout = {}    # key -> expiry time (seconds since the epoch)
        self.soonest = 0     # earliest expiry time in self.timeout
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        """Set how many seconds an unused connection is kept."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which the oldest connection is evicted."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached connection for these parameters, creating one
        if necessary, and refresh its expiry time."""
        key = user, host, port, '/'.join(dirs), timeout
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Close expired connections and enforce the size limit."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]

        # then check the size
        # The victim is looked up by its actual expiry time (self.soonest
        # may be stale after an entry was refreshed) and is closed before
        # being dropped.
        while self.cache and len(self.cache) >= self.max_conns:
            oldest = min(self.timeout, key=self.timeout.get)
            self.cache[oldest].close()
            del self.cache[oldest]
            del self.timeout[oldest]

        # An empty cache has no expiry; fall back to the initial value.
        self.soonest = min(self.timeout.values()) if self.timeout else 0

    def clear_cache(self):
        """Close and forget every cached connection."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
1447
1448
# Code moved from the old urllib module
1450
1451MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
1452
1453# Helper for non-unix systems
if os.name != 'nt':
    def url2pathname(pathname):
        """Turn the path part of a 'file' URL into a file system path by
        undoing its percent-encoding; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """Turn a file system path into the path part of a 'file' URL by
        percent-encoding it; not recommended for general use."""
        return quote(pathname)
else:
    # Windows needs drive-letter aware conversions.
    from nturl2path import url2pathname, pathname2url
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001466
1467# This really consists of two pieces:
1468# (1) a class which handles opening of all sorts of URLs
1469# (plus assorted utilities etc.)
1470# (2) a set of functions for parsing URLs
1471# XXX Should these be separated out into different modules?
1472
1473
1474ftpcache = {}
1475class URLopener:
1476 """Class to open URLs.
1477 This is a class rather than just a subroutine because we may need
1478 more than one set of global protocol-specific options.
1479 Note -- this is a base class for those who don't want the
1480 automatic handling of errors type 302 (relocated) and 401
1481 (authorization needed)."""
1482
1483 __tempfiles = None
1484
1485 version = "Python-urllib/%s" % __version__
1486
1487 # Constructor
1488 def __init__(self, proxies=None, **x509):
1489 if proxies is None:
1490 proxies = getproxies()
1491 assert hasattr(proxies, 'keys'), "proxies must be a mapping"
1492 self.proxies = proxies
1493 self.key_file = x509.get('key_file')
1494 self.cert_file = x509.get('cert_file')
1495 self.addheaders = [('User-Agent', self.version)]
1496 self.__tempfiles = []
1497 self.__unlink = os.unlink # See cleanup()
1498 self.tempcache = None
1499 # Undocumented feature: if you assign {} to tempcache,
1500 # it is used to cache files retrieved with
1501 # self.retrieve(). This is not enabled by default
1502 # since it does not work for changing documents (and I
1503 # haven't got the logic to check expiration headers
1504 # yet).
1505 self.ftpcache = ftpcache
1506 # Undocumented feature: you can use a different
1507 # ftp cache by assigning to the .ftpcache member;
1508 # in case you want logically independent URL openers
1509 # XXX This is not threadsafe. Bah.
1510
1511 def __del__(self):
1512 self.close()
1513
1514 def close(self):
1515 self.cleanup()
1516
1517 def cleanup(self):
1518 # This code sometimes runs when the rest of this module
1519 # has already been deleted, so it can't use any globals
1520 # or import anything.
1521 if self.__tempfiles:
1522 for file in self.__tempfiles:
1523 try:
1524 self.__unlink(file)
1525 except OSError:
1526 pass
1527 del self.__tempfiles[:]
1528 if self.tempcache:
1529 self.tempcache.clear()
1530
1531 def addheader(self, *args):
1532 """Add a header to be used by the HTTP interface only
1533 e.g. u.addheader('Accept', 'sound/basic')"""
1534 self.addheaders.append(args)
1535
1536 # External interface
1537 def open(self, fullurl, data=None):
1538 """Use URLopener().open(file) instead of open(file, 'r')."""
Georg Brandl13e89462008-07-01 19:56:00 +00001539 fullurl = unwrap(to_bytes(fullurl))
Senthil Kumaran734f0592010-02-20 22:19:04 +00001540 fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001541 if self.tempcache and fullurl in self.tempcache:
1542 filename, headers = self.tempcache[fullurl]
1543 fp = open(filename, 'rb')
Georg Brandl13e89462008-07-01 19:56:00 +00001544 return addinfourl(fp, headers, fullurl)
1545 urltype, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001546 if not urltype:
1547 urltype = 'file'
1548 if urltype in self.proxies:
1549 proxy = self.proxies[urltype]
Georg Brandl13e89462008-07-01 19:56:00 +00001550 urltype, proxyhost = splittype(proxy)
1551 host, selector = splithost(proxyhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001552 url = (host, fullurl) # Signal special case to open_*()
1553 else:
1554 proxy = None
1555 name = 'open_' + urltype
1556 self.type = urltype
1557 name = name.replace('-', '_')
1558 if not hasattr(self, name):
1559 if proxy:
1560 return self.open_unknown_proxy(proxy, fullurl, data)
1561 else:
1562 return self.open_unknown(fullurl, data)
1563 try:
1564 if data is None:
1565 return getattr(self, name)(url)
1566 else:
1567 return getattr(self, name)(url, data)
Antoine Pitrou6b4883d2011-10-12 02:54:14 +02001568 except HTTPError:
1569 raise
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001570 except socket.error as msg:
1571 raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1572
def open_unknown(self, fullurl, data=None):
    """Overridable interface to open unknown URL type.

    This implementation always raises IOError('url error',
    'unknown url type', scheme), where scheme is what splittype() extracts
    from fullurl.  data is accepted for interface compatibility and is not
    used here.
    """
    scheme = splittype(fullurl)[0]
    raise IOError('url error', 'unknown url type', scheme)
1577
def open_unknown_proxy(self, proxy, fullurl, data=None):
    """Overridable interface to open a URL through an unusable proxy.

    open() calls this when a proxy is configured for the URL's scheme but
    there is no open_* method for the proxy's own scheme.  This
    implementation always raises IOError('url error',
    'invalid proxy for SCHEME', proxy), where SCHEME is what splittype()
    extracts from fullurl.  data is accepted but not used here.
    """
    scheme = splittype(fullurl)[0]
    raise IOError('url error', 'invalid proxy for %s' % scheme, proxy)
1582
1583 # External interface
def retrieve(self, url, filename=None, reporthook=None, data=None):
    """retrieve(url) returns (filename, headers) for a local object
    or (tempfilename, headers) for a remote object.

    A URL already present in self.tempcache is answered from the cache.
    When no filename is given and the URL has no scheme or the 'file'
    scheme, the local file is not copied: its own path is returned.
    Otherwise the URL is opened with self.open(url, data) and its content
    is copied in 8 KiB blocks to filename, or to a new temporary file that
    is also recorded in self.__tempfiles.

    reporthook, if given, is called as reporthook(blocknum, blocksize,
    size) once before the first read and once after every block written;
    size is -1 when the response carries no Content-Length header.

    Raises ContentTooShortError when fewer bytes than the announced
    Content-Length were received.
    """
    url = unwrap(to_bytes(url))
    if self.tempcache and url in self.tempcache:
        return self.tempcache[url]
    scheme, url1 = splittype(url)
    if filename is None and (not scheme or scheme == 'file'):
        try:
            fp = self.open_local_file(url1)
            hdrs = fp.info()
            fp.close()
            return url2pathname(splithost(url1)[1]), hdrs
        except IOError:
            # Not available as a local file; fall back to a normal open().
            pass
    fp = self.open(url, data)
    try:
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            import tempfile
            # Reduce the URL to its bare path so the temporary file can
            # inherit the path's extension.
            path = splittype(url)[1]
            path = splithost(path or "")[1]
            path = splitquery(path or "")[0]
            path = splitattr(path or "")[0]
            suffix = os.path.splitext(path)[1]
            (fd, filename) = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        with tfp:
            result = filename, headers
            if self.tempcache is not None:
                self.tempcache[url] = result
            bs = 1024*8
            size = -1
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
    finally:
        fp.close()

    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result
1647
1648 # Each method named open_<type> knows how to open that type of URL
1649
1650 def _open_generic_http(self, connection_factory, url, data):
1651 """Make an HTTP connection using connection_class.
1652
1653 This is an internal method that should be called from
1654 open_http() or open_https().
1655
1656 Arguments:
1657 - connection_factory should take a host name and return an
1658 HTTPConnection instance.
1659 - url is the url to retrieval or a host, relative-path pair.
1660 - data is payload for a POST request or None.
1661 """
1662
1663 user_passwd = None
1664 proxy_passwd= None
1665 if isinstance(url, str):
Georg Brandl13e89462008-07-01 19:56:00 +00001666 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001667 if host:
Georg Brandl13e89462008-07-01 19:56:00 +00001668 user_passwd, host = splituser(host)
1669 host = unquote(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001670 realhost = host
1671 else:
1672 host, selector = url
1673 # check whether the proxy contains authorization information
Georg Brandl13e89462008-07-01 19:56:00 +00001674 proxy_passwd, host = splituser(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001675 # now we proceed with the url we want to obtain
Georg Brandl13e89462008-07-01 19:56:00 +00001676 urltype, rest = splittype(selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001677 url = rest
1678 user_passwd = None
1679 if urltype.lower() != 'http':
1680 realhost = None
1681 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001682 realhost, rest = splithost(rest)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001683 if realhost:
Georg Brandl13e89462008-07-01 19:56:00 +00001684 user_passwd, realhost = splituser(realhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001685 if user_passwd:
1686 selector = "%s://%s%s" % (urltype, realhost, rest)
1687 if proxy_bypass(realhost):
1688 host = realhost
1689
1690 #print "proxy via http:", host, selector
1691 if not host: raise IOError('http error', 'no host given')
1692
1693 if proxy_passwd:
Senthil Kumaran5626eec2010-08-04 17:46:23 +00001694 proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001695 else:
1696 proxy_auth = None
1697
1698 if user_passwd:
Senthil Kumaran5626eec2010-08-04 17:46:23 +00001699 auth = base64.b64encode(user_passwd.encode()).decode('ascii')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001700 else:
1701 auth = None
1702 http_conn = connection_factory(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001703 headers = {}
1704 if proxy_auth:
1705 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
1706 if auth:
1707 headers["Authorization"] = "Basic %s" % auth
1708 if realhost:
1709 headers["Host"] = realhost
Senthil Kumarand91ffca2011-03-19 17:25:27 +08001710
1711 # Add Connection:close as we don't support persistent connections yet.
1712 # This helps in closing the socket and avoiding ResourceWarning
1713
1714 headers["Connection"] = "close"
1715
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001716 for header, value in self.addheaders:
1717 headers[header] = value
1718
1719 if data is not None:
1720 headers["Content-Type"] = "application/x-www-form-urlencoded"
1721 http_conn.request("POST", selector, data, headers)
1722 else:
1723 http_conn.request("GET", selector, headers=headers)
1724
1725 try:
1726 response = http_conn.getresponse()
1727 except http.client.BadStatusLine:
1728 # something went wrong with the HTTP status line
Georg Brandl13e89462008-07-01 19:56:00 +00001729 raise URLError("http protocol error: bad status line")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001730
1731 # According to RFC 2616, "2xx" code indicates that the client's
1732 # request was successfully received, understood, and accepted.
1733 if 200 <= response.status < 300:
Antoine Pitroub353c122009-02-11 00:39:14 +00001734 return addinfourl(response, response.msg, "http:" + url,
Georg Brandl13e89462008-07-01 19:56:00 +00001735 response.status)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001736 else:
1737 return self.http_error(
1738 url, response.fp,
1739 response.status, response.reason, response.msg, data)
1740
def open_http(self, url, data=None):
    """Use HTTP protocol.

    Delegates to _open_generic_http() with http.client.HTTPConnection as
    the connection factory."""
    connection_factory = http.client.HTTPConnection
    return self._open_generic_http(connection_factory, url, data)
1744
def http_error(self, url, fp, errcode, errmsg, headers, data=None):
    """Handle http errors.

    Derived class can override this, or provide specific handlers
    named http_error_DDD where DDD is the 3-digit error code.

    A specific handler is called with data appended only when data is not
    None.  If there is no specific handler, or it returns a false value,
    http_error_default() decides the outcome."""
    handler_name = 'http_error_%d' % errcode
    if hasattr(self, handler_name):
        handler = getattr(self, handler_name)
        extra = () if data is None else (data,)
        outcome = handler(url, fp, errcode, errmsg, headers, *extra)
        if outcome:
            return outcome
    return self.http_error_default(url, fp, errcode, errmsg, headers)
1760
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Default error handler: close the connection and raise HTTPError.

    The response body is read and discarded and fp is closed before the
    exception is raised, so the HTTPError carries no file object."""
    fp.read()
    fp.close()
    raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001766
if _have_ssl:
    def _https_connection(self, host):
        """Return an HTTPSConnection to host that uses this opener's
        key_file and cert_file."""
        tls_options = {'key_file': self.key_file,
                       'cert_file': self.cert_file}
        return http.client.HTTPSConnection(host, **tls_options)

    def open_https(self, url, data=None):
        """Use HTTPS protocol."""
        connection_factory = self._https_connection
        return self._open_generic_http(connection_factory, url, data)
1776
def open_file(self, url):
    """Use local file.

    Only URLs without a host part, with an empty host, or with the host
    'localhost' are accepted; they are passed to open_local_file().

    Raises URLError when called with the (host, url) pair used for proxied
    requests, and ValueError when the URL names any other host.
    """
    if not isinstance(url, str):
        # URLError's second parameter is a filename, so the explanation
        # has to be part of the single reason string.
        raise URLError('file error: proxy support for file protocol currently not implemented')
    if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
        raise ValueError("file:// scheme is supported only on localhost")
    return self.open_local_file(url)
1785
def open_local_file(self, url):
    """Use local file.

    url is the part of a file URL after the scheme.  The file is opened in
    binary mode and returned wrapped in addinfourl together with
    synthesized Content-Type, Content-Length and Last-modified headers.

    A URL that names a host is served only when it carries no port and the
    host resolves to one of this machine's addresses.

    Raises URLError when the file cannot be stat'ed or the URL names
    another host.
    """
    import email.utils
    import mimetypes
    host, file = splithost(url)
    localname = url2pathname(file)
    try:
        stats = os.stat(localname)
    except OSError as e:
        # URLError takes (reason, filename) only.
        raise URLError(e.strerror, e.filename)
    size = stats.st_size
    modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
    mtype = mimetypes.guess_type(url)[0]
    headers = email.message_from_string(
        'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
        (mtype or 'text/plain', size, modified))

    def local_response():
        # Absolute paths are reported back as file:// URLs.
        urlfile = file
        if file[:1] == '/':
            urlfile = 'file://' + file
        return addinfourl(open(localname, 'rb'), headers, urlfile)

    if not host:
        return local_response()
    host, port = splitport(host)
    # localhost() returns one address string and thishost() a tuple of
    # them; wrap the former so the two can be combined.
    if (not port
            and socket.gethostbyname(host) in ((localhost(),) + thishost())):
        return local_response()
    raise URLError('local file error: not on local host')
1815
def open_ftp(self, url):
    """Use FTP protocol.

    url is the part of an ftp URL after the scheme.  User, password, host
    and port are taken from it, the host is resolved, and a ';type=X'
    attribute (a, i or d, either case) selects the transfer type;
    otherwise 'I' is used for a file and 'D' when the path names no file.
    Connections are kept in self.ftpcache, keyed by user, host, port and
    directory.

    Returns the data wrapped in addinfourl.  Raises URLError for proxied
    requests, a missing host, and any FTP-level failure.
    """
    if not isinstance(url, str):
        # URLError's second parameter is a filename, so each explanation
        # below is folded into the single reason string.
        raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
    import mimetypes
    host, path = splithost(url)
    if not host: raise URLError('ftp error: no host given')
    host, port = splitport(host)
    user, host = splituser(host)
    if user: user, passwd = splitpasswd(user)
    else: passwd = None
    host = unquote(host)
    user = unquote(user or '')
    passwd = unquote(passwd or '')
    host = socket.gethostbyname(host)
    if not port:
        import ftplib
        port = ftplib.FTP_PORT
    else:
        port = int(port)
    path, attrs = splitattr(path)
    path = unquote(path)
    dirs = path.split('/')
    dirs, file = dirs[:-1], dirs[-1]
    if dirs and not dirs[0]: dirs = dirs[1:]
    if dirs and not dirs[0]: dirs[0] = '/'
    key = user, host, port, '/'.join(dirs)
    # XXX thread unsafe!
    if len(self.ftpcache) > MAXFTPCACHE:
        # Prune the cache, rather arbitrarily.  Iterate over a snapshot of
        # the keys: entries are removed while looping.
        for k in list(self.ftpcache):
            if k != key:
                v = self.ftpcache.pop(k)
                v.close()
    try:
        if key not in self.ftpcache:
            self.ftpcache[key] = \
                ftpwrapper(user, passwd, host, port, dirs)
        if not file: ftype = 'D'
        else: ftype = 'I'
        for attr in attrs:
            attr, value = splitvalue(attr)
            if attr.lower() == 'type' and \
               value in ('a', 'A', 'i', 'I', 'd', 'D'):
                ftype = value.upper()
        (fp, retrlen) = self.ftpcache[key].retrfile(file, ftype)
        mtype = mimetypes.guess_type("ftp:" + url)[0]
        headers = ""
        if mtype:
            headers += "Content-Type: %s\n" % mtype
        if retrlen is not None and retrlen >= 0:
            headers += "Content-Length: %d\n" % retrlen
        headers = email.message_from_string(headers)
        return addinfourl(fp, headers, "ftp:" + url)
    except ftperrors() as msg:
        raise URLError('ftp error: %s' % msg) from msg
1873
def open_data(self, url, data=None):
    """Use "data" URL.

    url is the part of a data URL after the scheme.  The payload is
    base64-decoded when the media type ends in ';base64' and
    percent-unquoted otherwise.  The result is an addinfourl object whose
    body is a text message made of Date, Content-type and Content-Length
    headers followed by the payload.  Any POSTed data is ignored.

    Raises URLError for proxied requests and IOError when url contains no
    comma.
    """
    if not isinstance(url, str):
        # URLError's second parameter is a filename, so the explanation
        # has to be part of the single reason string.
        raise URLError('data error: proxy support for data protocol currently not implemented')
    # ignore POSTed data
    #
    # syntax of data URLs:
    # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    # mediatype := [ type "/" subtype ] *( ";" parameter )
    # data      := *urlchar
    # parameter := attribute "=" value
    try:
        [mediatype, data] = url.split(',', 1)
    except ValueError:
        raise IOError('data error', 'bad data URL')
    if not mediatype:
        mediatype = 'text/plain;charset=US-ASCII'
    # A trailing ';token' without '=' is an encoding name, not a parameter.
    semi = mediatype.rfind(';')
    if semi >= 0 and '=' not in mediatype[semi:]:
        encoding = mediatype[semi+1:]
        mediatype = mediatype[:semi]
    else:
        encoding = ''
    msg = []
    msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                        time.gmtime(time.time())))
    msg.append('Content-type: %s' % mediatype)
    if encoding == 'base64':
        # XXX is this encoding/decoding ok?
        data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
    else:
        data = unquote(data)
    msg.append('Content-Length: %d' % len(data))
    msg.append('')
    msg.append(data)
    msg = '\n'.join(msg)
    headers = email.message_from_string(msg)
    f = io.StringIO(msg)
    return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001914
1915
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}    # 'realm@host' -> (user, passwd)
        self.tries = 0          # redirects followed for the current request
        self.maxtries = 10      # redirect limit; a false value disables it

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            # Too many redirects in a row: report it as a 500.
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Follow the redirect named by the Location (or URI) header.

        Returns None when neither header is present; otherwise the old
        response is drained and closed and the new URL is opened with
        self.open().  Raises HTTPError when the target scheme is not http,
        https or ftp.
        """
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        fp.read()
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3

        if urlparts.scheme not in ('http', 'https', 'ftp'):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def _basic_auth_challenge(self, challenge_header, retry_prefix, url, fp,
                              errcode, errmsg, headers, data, retry):
        """Shared body of http_error_401() and http_error_407().

        Falls back to URLopener.http_error_default() unless headers carry
        a Basic challenge in challenge_header and retry is true; otherwise
        dispatches to the method named
        retry_prefix + self.type + '_basic_auth'.
        """
        def give_up():
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)

        if challenge_header not in headers:
            give_up()
        stuff = headers[challenge_header]
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            give_up()
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            give_up()
        if not retry:
            give_up()
        name = retry_prefix + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        else:
            return getattr(self, name)(url, realm, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
                       retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        return self._basic_auth_challenge('www-authenticate', 'retry_',
                                          url, fp, errcode, errmsg, headers,
                                          data, retry)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
                       retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        return self._basic_auth_challenge('proxy-authenticate', 'retry_proxy_',
                                          url, fp, errcode, errmsg, headers,
                                          data, retry)

    def _with_credentials(self, hostport, realm):
        """Return hostport prefixed with quoted 'user:passwd@'.

        Any credentials already present in hostport are dropped, and when
        there were some the cached entry is discarded as well.  Returns
        None when no user or password is obtained.
        """
        i = hostport.find('@') + 1
        hostport = hostport[i:]
        user, passwd = self.get_user_passwd(hostport, realm, i)
        if not (user or passwd):
            return None
        return "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), hostport)

    def _reopen(self, newurl, data):
        """Open newurl, passing data along only when it is not None."""
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def _retry_proxy_basic_auth(self, scheme, url, realm, data):
        """Store credentials in self.proxies[scheme] and reopen url."""
        host, selector = splithost(url)
        newurl = scheme + '://' + host + selector
        proxy = self.proxies[scheme]
        proxyhost = splittype(proxy)[1]
        proxyhost, proxyselector = splithost(proxyhost)
        proxyhost = self._with_credentials(proxyhost, realm)
        if proxyhost is None:
            return None
        self.proxies[scheme] = scheme + '://' + proxyhost + proxyselector
        return self._reopen(newurl, data)

    def _retry_basic_auth(self, scheme, url, realm, data):
        """Reopen url with credentials embedded in its host part."""
        host, selector = splithost(url)
        host = self._with_credentials(host, realm)
        if host is None:
            return None
        newurl = scheme + '://' + host + selector
        return self._reopen(newurl, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Retry an http request with credentials added to the http proxy."""
        return self._retry_proxy_basic_auth('http', url, realm, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Retry an https request with credentials added to the https proxy."""
        return self._retry_proxy_basic_auth('https', url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Retry an http request with credentials embedded in the URL."""
        return self._retry_basic_auth('http', url, realm, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Retry an https request with credentials embedded in the URL."""
        return self._retry_basic_auth('https', url, realm, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for realm at host.

        A cached pair is returned unless clear_cache is true, in which
        case it is discarded first.  Otherwise prompt_user_passwd() is
        asked, and its answer is cached when it contains a user or a
        password.
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
2125
2126
2127# Utility functions
2128
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup is done once; the answer is kept in _localhost."""
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2136
_thishost = None
def thishost():
    """Return the IP addresses of the current host.

    The addresses are looked up once and kept in _thishost as a tuple."""
    global _thishost
    if _thishost is not None:
        return _thishost
    hostname = socket.gethostname()
    addresses = socket.gethostbyname_ex(hostname)[2]
    _thishost = tuple(addresses)
    return _thishost
2144
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported on first use and its all_errors is kept in
    _ftperrors."""
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2153
_noheaders = None
def noheaders():
    """Return an empty email Message object.

    The object is created on the first call and the same instance is
    returned afterwards."""
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2161
2162
2163# Utility classes
2164
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.refcount = 0               # files handed out and not yet closed
        self.keepalive = persistent     # cleared by close()
        try:
            self.init()
        except BaseException:
            # Do not leave a half-open connection behind when connecting,
            # logging in or changing directory fails.
            self.close()
            raise

    def init(self):
        """Connect, log in and change into self.dirs."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        """Start retrieving file and return (fileobj, length).

        type is the FTP transfer type; 'd' or 'D' asks for a directory
        listing.  A file the server refuses with a 550 reply, or an empty
        file name, is listed as a directory instead.  length is whatever
        ntransfercmd() reports.  Closing fileobj calls file_close().

        Raises URLError for other permission errors.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The command failed on the existing connection; reconnect
            # and try once more.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                if str(reason)[:3] != '550':
                    raise URLError('ftp error: %s' % reason) from reason
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error: %s' % reason) from reason
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        """Finish a pending transfer by reading its final response."""
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        """Stop keeping the connection alive; close it once it is unused."""
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        """Close hook for objects returned by retrfile()."""
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        """Close the FTP connection, ignoring FTP errors."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2260
2261# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    Variables are matched case-insensitively and empty values are ignored.
    When the same scheme is given in more than one spelling, the
    all-lowercase variable wins.  If REQUEST_METHOD is set, which
    indicates a CGI script, only the lowercase http_proxy is honoured for
    'http': in that environment HTTP_PROXY is derived from a client's
    "Proxy:" request header and cannot be trusted.

    """
    proxies = {}
    lowercase = {}
    for name, value in os.environ.items():
        key = name.lower()
        if not value or key[-6:] != '_proxy':
            continue
        if name == key:
            lowercase[key[:-6]] = value
        else:
            proxies[key[:-6]] = value
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    # Applied last so that lowercase spellings take precedence.
    proxies.update(lowercase)
    return proxies
2277
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy (or NO_PROXY),
    which should be a list of DNS suffixes separated by commas, or '*' for
    all hosts.

    host may include a ':port' part.  An entry matches when the host, with
    or without its port, equals the entry or ends with '.' + entry, so
    'example.com' covers 'www.example.com' but not 'notexample.com'.
    Matching ignores case and any leading dots on the entry.

    Returns True when the proxy should be bypassed, False otherwise.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if no_proxy == '*':
        return True
    host = host.lower()
    # strip port off host
    hostonly, port = splitport(host)
    # check if the host ends with any of the DNS suffixes
    for name in no_proxy.split(','):
        name = name.strip().lstrip('.').lower()
        if not name:
            continue
        if hostonly == name or host == name:
            return True
        # Require a label boundary so that one domain's entry does not
        # also cover unrelated names that merely end with the same text.
        suffix = '.' + name
        if hostonly.endswith(suffix) or host.endswith(suffix):
            return True
    # otherwise, don't bypass
    return False
2297
2298
Ronald Oussorene72e1612011-03-14 18:15:25 -04002299# This code tests an OSX specific data structure but is testable on all
2300# platforms
def _proxy_bypass_macosx_sysconf(host, proxy_settings):
    """
    Return True iff this host shouldn't be accessed using a proxy

    This function uses the MacOSX framework SystemConfiguration
    to fetch the proxy information.

    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
    { 'exclude_simple': bool,
      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
    }

    Purely numeric exceptions ('127.0.0.1', '10.1', '10.0/16') are treated
    as IPv4 prefixes and compared with the resolved address of the host;
    entries whose prefix length exceeds 32 bits are ignored.  Every other
    exception is a glob pattern matched against the host name, with and
    without its ':port' part.
    """
    from fnmatch import fnmatch

    hostonly, port = splitport(host)

    def ip2num(ipAddr):
        # Pack a dotted IPv4 address into one integer; abbreviated forms
        # such as '10.1' are padded on the right with zero octets.
        parts = [int(part) for part in ipAddr.split('.')]
        if len(parts) != 4:
            parts = (parts + [0, 0, 0, 0])[:4]
        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

    # Check for simple host names:
    if '.' not in host and proxy_settings['exclude_simple']:
        return True

    hostIP = None
    lookup_failed = False

    for value in proxy_settings.get('exceptions', ()):
        # Items in the list are strings like these: *.local, 169.254/16
        if not value:
            continue

        # Anchored at the end so that a host name which merely starts with
        # digits (e.g. '1password.com') is handled as a glob, not an IP.
        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?$", value)
        if m is None:
            if fnmatch(host, value) or fnmatch(hostonly, value):
                return True
            continue

        mask = m.group(2)
        if mask is None:
            mask = 8 * (m.group(1).count('.') + 1)
        else:
            mask = int(mask[1:])
        if mask > 32:
            # Not a valid IPv4 prefix length; shifting by a negative
            # amount below would raise ValueError, so skip the entry.
            continue
        shift = 32 - mask

        if hostIP is None:
            if lookup_failed:
                # The name did not resolve earlier; don't ask again.
                continue
            try:
                hostIP = ip2num(socket.gethostbyname(hostonly))
            except socket.error:
                lookup_failed = True
                continue

        if (hostIP >> shift) == (ip2num(m.group(1)) >> shift):
            return True

    return False
2359
2360
if sys.platform == 'darwin':
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        """Check host against the proxy settings reported by _scproxy."""
        return _proxy_bypass_macosx_sysconf(host, _get_proxy_settings())

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()

    def proxy_bypass(host):
        """Test if proxies should not be used for a particular host.

        The environment decides when it defines any proxies; otherwise
        the SystemConfiguration settings are consulted.
        """
        if not getproxies_environment():
            return proxy_bypass_macosx_sysconf(host)
        return proxy_bypass_environment(host)

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or from SystemConfiguration.
        """
        env_proxies = getproxies_environment()
        if env_proxies:
            return env_proxies
        return getproxies_macosx_sysconf()
2386
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002387
2388elif os.name == 'nt':
2389 def getproxies_registry():
2390 """Return a dictionary of scheme -> proxy server URL mappings.
2391
2392 Win32 uses the registry to store proxies.
2393
2394 """
2395 proxies = {}
2396 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002397 import winreg
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002398 except ImportError:
2399 # Std module, so should be around - but you never know!
2400 return proxies
2401 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002402 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002403 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002404 proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002405 'ProxyEnable')[0]
2406 if proxyEnable:
2407 # Returned as Unicode but problems if not converted to ASCII
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002408 proxyServer = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002409 'ProxyServer')[0])
2410 if '=' in proxyServer:
2411 # Per-protocol settings
2412 for p in proxyServer.split(';'):
2413 protocol, address = p.split('=', 1)
2414 # See if address has a type:// prefix
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002415 if not re.match('^([^/:]+)://', address):
2416 address = '%s://%s' % (protocol, address)
2417 proxies[protocol] = address
2418 else:
2419 # Use one setting for all protocols
2420 if proxyServer[:5] == 'http:':
2421 proxies['http'] = proxyServer
2422 else:
2423 proxies['http'] = 'http://%s' % proxyServer
Senthil Kumaran04f31b82010-07-14 20:10:52 +00002424 proxies['https'] = 'https://%s' % proxyServer
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002425 proxies['ftp'] = 'ftp://%s' % proxyServer
2426 internetSettings.Close()
2427 except (WindowsError, ValueError, TypeError):
2428 # Either registry key not found etc, or the value in an
2429 # unexpected format.
2430 # proxies already set up to be empty so nothing to do
2431 pass
2432 return proxies
2433
2434 def getproxies():
2435 """Return a dictionary of scheme -> proxy server URL mappings.
2436
2437 Returns settings gathered from the environment, if specified,
2438 or the registry.
2439
2440 """
2441 return getproxies_environment() or getproxies_registry()
2442
2443 def proxy_bypass_registry(host):
2444 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002445 import winreg
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002446 except ImportError:
2447 # Std modules, so should be around - but you never know!
2448 return 0
2449 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002450 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002451 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002452 proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002453 'ProxyEnable')[0]
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002454 proxyOverride = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002455 'ProxyOverride')[0])
2456 # ^^^^ Returned as Unicode but problems if not converted to ASCII
2457 except WindowsError:
2458 return 0
2459 if not proxyEnable or not proxyOverride:
2460 return 0
2461 # try to make a host list from name and IP address.
Georg Brandl13e89462008-07-01 19:56:00 +00002462 rawHost, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002463 host = [rawHost]
2464 try:
2465 addr = socket.gethostbyname(rawHost)
2466 if addr != rawHost:
2467 host.append(addr)
2468 except socket.error:
2469 pass
2470 try:
2471 fqdn = socket.getfqdn(rawHost)
2472 if fqdn != rawHost:
2473 host.append(fqdn)
2474 except socket.error:
2475 pass
2476 # make a check value list from the registry entry: replace the
2477 # '<local>' string by the localhost entry and the corresponding
2478 # canonical entry.
2479 proxyOverride = proxyOverride.split(';')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002480 # now check if we match one of the registry values.
2481 for test in proxyOverride:
Senthil Kumaran49476062009-05-01 06:00:23 +00002482 if test == '<local>':
2483 if '.' not in rawHost:
2484 return 1
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002485 test = test.replace(".", r"\.") # mask dots
2486 test = test.replace("*", r".*") # change glob sequence
2487 test = test.replace("?", r".") # change glob char
2488 for val in host:
2489 # print "%s <--> %s" %( test, val )
2490 if re.match(test, val, re.I):
2491 return 1
2492 return 0
2493
2494 def proxy_bypass(host):
2495 """Return a dictionary of scheme -> proxy server URL mappings.
2496
2497 Returns settings gathered from the environment, if specified,
2498 or the registry.
2499
2500 """
2501 if getproxies_environment():
2502 return proxy_bypass_environment(host)
2503 else:
2504 return proxy_bypass_registry(host)
2505
2506else:
2507 # By default use environment variables
2508 getproxies = getproxies_environment
2509 proxy_bypass = proxy_bypass_environment