blob: 71011bda7100e1a2d0427ba0f80e3a6db4ca5b97 [file] [log] [blame]
"""An extensible library for opening URLs using a variety of protocols

The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.

The OpenerDirector manages a collection of Handler objects that do
all the actual work.  Each Handler implements a particular protocol or
option.  The OpenerDirector is a composite object that invokes the
Handlers needed to open the requested URL.  For example, the
HTTPHandler performs HTTP GET and POST requests and deals with
non-error returns.  The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
deals with digest authentication.

urlopen(url, data=None) -- Basic usage is the same as the original
urllib.  Pass the URL and optionally data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of a URL.  Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.

build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of the default
handler, the argument will be installed instead of the default.

install_opener -- Installs a new opener as the default opener.

objects of interest:

OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
the Handler classes, while dealing with requests and responses.

Request -- An object that encapsulates the state of a request.  The
state can be as simple as the URL.  It can also include extra HTTP
headers, e.g. a User-Agent.

BaseHandler --

internals:
BaseHandler and parent
_call_chain conventions

Example usage:

import urllib.request

# set up authentication info
authinfo = urllib.request.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
                      uri='https://mahler:8092/site-updates.py',
                      user='klem',
                      passwd='geheim$parole')

proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib.request.build_opener(proxy_support, authinfo,
                                     urllib.request.CacheFTPHandler)

# install it
urllib.request.install_opener(opener)

f = urllib.request.urlopen('http://www.python.org/')
"""
69
# XXX issues:
# If an authentication error handler tries to perform authentication
# for some reason but fails, how should the error be signalled?  The
# client needs to know the HTTP error code.  But if the handler knows
# that the problem was, e.g., that it didn't know the hash algo that
# was requested in the challenge, it would be good to pass that
# information along to the client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation
79
80# Possible extensions:
81# complex proxies XXX not sure what exactly was meant by this
82# abstract factory for opener
83
84import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000085import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000086import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import random
93import re
94import socket
95import sys
96import time
Senthil Kumaran7bc0d872010-12-19 10:49:52 +000097import collections
Jeremy Hylton1afc1692008-06-18 20:49:58 +000098
Georg Brandl13e89462008-07-01 19:56:00 +000099from urllib.error import URLError, HTTPError, ContentTooShortError
100from urllib.parse import (
101 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
102 splittype, splithost, splitport, splituser, splitpasswd,
Senthil Kumarand95cc752010-08-08 11:27:53 +0000103 splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000104from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000105
# Check for SSL support; urlopen() refuses cafile/capath when it is missing.
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True

# Used in the User-Agent header sent by OpenerDirector.  Built from
# sys.version_info rather than by slicing sys.version, because the slice
# truncates two-digit minor versions ("3.10.4"[:3] == "3.1").
__version__ = '%d.%d' % sys.version_info[:2]

# Module-wide default opener: created lazily by urlopen() or replaced by
# install_opener().
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None):
    """Open *url* through an opener and return whatever its open() returns.

    url, data and timeout are passed unchanged to the opener's open()
    method.

    When cafile or capath is given, a one-off opener is built around an
    HTTPSHandler whose SSL context requires certificate verification
    against those CA locations and which is asked to check the host name;
    the module-wide opener is neither used nor modified.  Otherwise the
    module-wide opener is used, being created with build_opener() on first
    need.

    Raises ValueError if cafile/capath is given but the ssl module could
    not be imported.
    """
    global _opener
    if cafile or capath:
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.options |= ssl.OP_NO_SSLv2
        # This branch is only entered when a CA location was supplied, so
        # verification is unconditionally required here.
        context.verify_mode = ssl.CERT_REQUIRED
        context.load_verify_locations(cafile, capath)
        https_handler = HTTPSHandler(context=context, check_hostname=True)
        opener = build_opener(https_handler)
    elif _opener is None:
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000139
def install_opener(opener):
    """Make *opener* the module-wide default opener used by urlopen()."""
    global _opener
    _opener = opener
143
# TODO(jhylton): Make this work with the same global opener.
_urlopener = None


def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Delegate to retrieve() on the shared module-level FancyURLopener.

    The opener is created on first use and kept in ``_urlopener`` for
    later calls; the return value is whatever its retrieve() returns.
    """
    global _urlopener
    shared = _urlopener
    if not shared:
        shared = _urlopener = FancyURLopener()
    return shared.retrieve(url, filename, reporthook, data)
151
def urlcleanup():
    """Clean up the shared legacy opener and forget the default opener.

    Calls cleanup() on ``_urlopener`` when one exists, then resets the
    module-wide ``_opener`` to None if it was set.
    """
    global _opener
    if _urlopener:
        _urlopener.cleanup()
    if _opener:
        _opener = None
158
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)


def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the network-location part of request.full_url,
    falling back to the request's "Host" header when the URL has none.
    A trailing ":port" is stripped.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    host = urlparse(request.full_url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    without_port = _cut_port_re.sub("", host, 1)
    return without_port.lower()
176
class Request:
    """Encapsulate the state of a single URL request.

    The constructor stores the URL (without any '<URL:...>' wrapper and
    with the '#fragment' split off into ``fragment``), the payload, the
    headers and the redirect-related bookkeeping, then derives ``type``,
    ``host`` and ``selector`` from the URL via _parse().

    Header names are normalised with str.capitalize() when they are added;
    the lookup methods (has_header / get_header) use the name exactly as
    given.
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):
        # unwrap('<URL:type://host/path>') --> 'type://host/path', after
        # which the fragment is separated from the URL proper.
        self.full_url, self.fragment = splittag(unwrap(url))
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        # The default ``headers`` dict is only read here, never modified.
        for name, content in headers.items():
            self.add_header(name, content)
        self.unredirected_hdrs = {}
        self.origin_req_host = (request_host(self)
                                if origin_req_host is None
                                else origin_req_host)
        self.unverifiable = unverifiable
        self.method = method
        self._parse()

    def _parse(self):
        """Split full_url into type, host and selector.

        Raises ValueError when the URL has no scheme.
        """
        scheme, remainder = splittype(self.full_url)
        self.type = scheme
        if scheme is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(remainder)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        # An explicit method wins; otherwise the presence of data decides.
        if self.method is not None:
            return self.method
        if self.data is not None:
            return "POST"
        return "GET"

    # Begin deprecated methods

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        # Re-attach the fragment that __init__ split off, if there was one.
        if not self.fragment:
            return self.full_url
        return '%s#%s' % (self.full_url, self.fragment)

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Route the request through the proxy at *host*.

        The first time an https request is proxied, the original host is
        remembered in _tunnel_host and the type/selector are left alone.
        In every other case the request takes on the proxy's *type* and
        the full URL becomes the selector.
        """
        first_https_proxy = self.type == 'https' and not self._tunnel_host
        if first_https_proxy:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        name = key.capitalize()
        self.headers[name] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        name = key.capitalize()
        self.unredirected_hdrs[name] = val

    def has_header(self, header_name):
        if header_name in self.headers:
            return True
        return header_name in self.unredirected_hdrs

    def get_header(self, header_name, default=None):
        # Regular headers take precedence over unredirected ones.
        fallback = self.unredirected_hdrs.get(header_name, default)
        return self.headers.get(header_name, fallback)

    def header_items(self):
        """Return all headers as a list of (name, value) pairs.

        Where a name occurs in both header sets, the regular header wins.
        """
        merged = self.unredirected_hdrs.copy()
        merged.update(self.headers)
        return list(merged.items())
281
class OpenerDirector:
    """Dispatch requests to the handlers registered with add_handler().

    Handlers are discovered by method name.  A method called
    ``<protocol>_open``, ``<protocol>_request``, ``<protocol>_response``
    or ``<protocol>_error_<kind>`` registers its handler in the matching
    lookup table under that protocol (and, for errors, that kind).
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler* for every protocol method it defines.

        Handlers are kept sorted (via bisect, i.e. by their ``__lt__``)
        within each lookup list.  A handler that defines no recognised
        method is ignored and its add_parent() is not called.

        Raises TypeError if *handler* has no add_parent attribute.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ("redirect_request", "do_open", "proxy_open"):
                # oops, coincidental match
                continue

            i = meth.find("_")
            if i == -1:
                # No "<protocol>_<condition>" structure.  Without this
                # guard the -1 index made an attribute named e.g. "open"
                # register under the bogus protocol "ope".
                continue
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # "<protocol>_error_<kind>": kind is what follows the
                # second underscore, as an int when it is numeric.
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        """Call *meth_name* on each handler in chain[kind] until one answers.

        Returns the first result that is not None, or None when every
        handler declined (or none is registered).
        """
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a Request-like object).

        The request is run through the ``<protocol>_request`` processors,
        opened via _open(), and the response is run through the
        ``<protocol>_response`` processors.  The protocol used to select
        the processors is the request's type before pre-processing.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        """Try the 'default', protocol-specific and 'unknown' open chains.

        Returns the first truthy result.  *data* is accepted but not used
        here.
        """
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered error handlers.

        For 'http' and 'https' the third positional argument selects the
        ``http_error_<code>`` chain, with ``http_error_default`` as the
        fallback when no specific handler returns a truthy result.
        Raises KeyError if no http error handler has been registered.
        """
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            chain = self.handle_error['http']  # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = True
            orig_args = args
        else:
            chain = self.handle_error
            meth_name = proto + '_error'
            http_err = False
        result = self._call_chain(chain, proto, meth_name, *args)
        if result:
            return result

        if http_err:
            return self._call_chain(chain, 'default', 'http_error_default',
                                    *orig_args)
419
420# XXX probably also want an abstract factory that knows when it makes
421# sense to skip a superclass in favor of a subclass and when it might
422# make sense to include both
423
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    Each argument may be a handler instance or a handler class (which is
    instantiated without arguments).  A default handler class is left out
    whenever one of the arguments is an instance or subclass of it.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    def overrides(candidate, default):
        # A class overrides by inheritance, an instance by type.
        if isclass(candidate):
            return issubclass(candidate, default)
        return isinstance(candidate, default)

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    overridden = {default
                  for default in default_classes
                  for supplied in handlers
                  if overrides(supplied, default)}

    # Defaults first, in their listed order, then the caller's handlers.
    for default in default_classes:
        if default not in overridden:
            opener.add_handler(default())

    for supplied in handlers:
        opener.add_handler(supplied() if isclass(supplied) else supplied)
    return opener
461
class BaseHandler:
    """Base class for handlers registered with an OpenerDirector.

    ``handler_order`` determines the sort position among handlers; lower
    values sort first.
    """
    handler_order = 500

    def add_parent(self, parent):
        """Remember the opener this handler has been added to."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        try:
            other_order = other.handler_order
        except AttributeError:
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other_order
479
480
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return *response* for 2xx codes, else the parent's error() result."""
        code = response.code
        msg = response.msg
        hdrs = response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= code < 300:
            return response
        return self.parent.error('http', request, response, code, msg, hdrs)

    https_response = http_response
497
class HTTPDefaultErrorHandler(BaseHandler):
    """Fallback HTTP error handler: always raises HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        failed_url = req.full_url
        raise HTTPError(failed_url, code, msg, hdrs, fp)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000501
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirects."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        method = req.get_method()
        redirectable = (
            (code in (301, 302, 303, 307) and method in ("GET", "HEAD"))
            or (code in (301, 302, 303) and method == "POST"))
        if not redirectable:
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        # The new request carries no body, so the headers describing the
        # old body are dropped.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items()
                      if k.lower() not in CONTENT_HEADERS}
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow the redirect named by the Location (or URI) header.

        Returns the response of the redirected request, or None when the
        response carries no redirect target or redirect_request() declines.
        Raises HTTPError for a disallowed target scheme or when the
        repeat/total redirection limits are reached.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.  An empty scheme is a relative reference:
        # it is resolved against the current request URL below and must
        # not be rejected here.
        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        # Only an absolute URL with a host and no path needs the root
        # path added; a relative reference such as "?q=1" must keep its
        # empty path so urljoin() resolves it correctly.
        if not urlparts.path and urlparts.netloc:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                    len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
604
605
606def _parse_proxy(proxy):
607 """Return (scheme, user, password, host/port) given a URL or an authority.
608
609 If a URL is supplied, it must have an authority (host:port) component.
610 According to RFC 3986, having an authority component means the URL must
611 have two slashes after the scheme:
612
613 >>> _parse_proxy('file:/ftp.example.com/')
614 Traceback (most recent call last):
615 ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
616
617 The first three items of the returned tuple may be None.
618
619 Examples of authority parsing:
620
621 >>> _parse_proxy('proxy.example.com')
622 (None, None, None, 'proxy.example.com')
623 >>> _parse_proxy('proxy.example.com:3128')
624 (None, None, None, 'proxy.example.com:3128')
625
626 The authority component may optionally include userinfo (assumed to be
627 username:password):
628
629 >>> _parse_proxy('joe:password@proxy.example.com')
630 (None, 'joe', 'password', 'proxy.example.com')
631 >>> _parse_proxy('joe:password@proxy.example.com:3128')
632 (None, 'joe', 'password', 'proxy.example.com:3128')
633
634 Same examples, but with URLs instead:
635
636 >>> _parse_proxy('http://proxy.example.com/')
637 ('http', None, None, 'proxy.example.com')
638 >>> _parse_proxy('http://proxy.example.com:3128/')
639 ('http', None, None, 'proxy.example.com:3128')
640 >>> _parse_proxy('http://joe:password@proxy.example.com/')
641 ('http', 'joe', 'password', 'proxy.example.com')
642 >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
643 ('http', 'joe', 'password', 'proxy.example.com:3128')
644
645 Everything after the authority is ignored:
646
647 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
648 ('ftp', 'joe', 'password', 'proxy.example.com')
649
650 Test for no trailing '/' case:
651
652 >>> _parse_proxy('http://joe:password@proxy.example.com')
653 ('http', 'joe', 'password', 'proxy.example.com')
654
655 """
Georg Brandl13e89462008-07-01 19:56:00 +0000656 scheme, r_scheme = splittype(proxy)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000657 if not r_scheme.startswith("/"):
658 # authority
659 scheme = None
660 authority = proxy
661 else:
662 # URL
663 if not r_scheme.startswith("//"):
664 raise ValueError("proxy URL with no authority: %r" % proxy)
665 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
666 # and 3.3.), path is empty or starts with '/'
667 end = r_scheme.find("/", 2)
668 if end == -1:
669 end = None
670 authority = r_scheme[2:end]
Georg Brandl13e89462008-07-01 19:56:00 +0000671 userinfo, hostport = splituser(authority)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000672 if userinfo is not None:
Georg Brandl13e89462008-07-01 19:56:00 +0000673 user, password = splitpasswd(userinfo)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000674 else:
675 user = password = None
676 return scheme, user, password, hostport
677
class ProxyHandler(BaseHandler):
    """Route requests through the proxies given per URL scheme.

    For every ``scheme -> proxy URL`` entry of the mapping, a
    ``<scheme>_open`` attribute is created on the instance that forwards
    to proxy_open().
    """
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, proxy_url in proxies.items():
            # Defaults bind the current loop values to each lambda.
            forwarder = (
                lambda r, proxy=proxy_url, type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, forwarder)

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*; re-open it if the proxy changes its type.

        Returns None when the host is bypassed or when other handlers can
        carry on with the modified request.
        """
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            credentials = '%s:%s' % (unquote(user), unquote(password))
            encoded = base64.b64encode(credentials.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + encoded)
        req.set_proxy(unquote(hostport), proxy_type)

        if orig_type in (proxy_type, 'https'):
            # let other handlers take care of it
            return None
        # need to start over, because the other handlers don't
        # grok the proxy's URL type
        # e.g. if we have a constructor arg proxies like so:
        # {'http': 'ftp://proxy.example.com'}, we may end up turning
        # a request for http://acme.example.com/a into one for
        # ftp://proxy.example.com/a
        return self.parent.open(req, timeout=req.timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000719
class HTTPPasswordMgr:
    """Store (user, password) pairs per realm and URI prefix.

    ``passwd`` maps a realm to a dict whose keys are tuples of reduced
    URIs (see reduce_uri) and whose values are (user, password) tuples.
    """

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* and *uri*.

        uri may be a single URI string or a sequence of them.  Each entry
        is stored twice: once with the scheme's default port made explicit
        and once exactly as given.
        """
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        realm_entries = self.passwd.setdefault(realm, {})
        for default_port in True, False:
            reduced_uri = tuple(
                self.reduce_uri(u, default_port) for u in uri)
            realm_entries[reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) stored for *realm* covering *authuri*.

        Returns (None, None) when nothing matches.
        """
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare whole path segments: a character-wise common prefix
        # would treat "/foobar" as lying below "/foo" and hand out the
        # credentials registered for "/foo".
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
782
783
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the realm ``None``."""

    def find_user_password(self, realm, authuri):
        """Look up *realm* first, then the default (None) realm."""
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is None:
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
792
793
class AbstractBasicAuthHandler:
    """Retry logic shared by the HTTP Basic authentication handlers.

    The methods read ``self.auth_header`` (name of the request header to
    set) and ``self.parent`` (opener used to resend the request); neither
    is set by this class.
    """

    # Matches '<scheme> realm=<quoted realm>' at the start of the header
    # value or directly after a comma; search() therefore yields the first
    # challenge that carries a realm.  The previous '(?:.*,)*' prefix
    # backtracked catastrophically on values containing many commas and
    # selected the last challenge instead, which made a leading Basic
    # challenge unusable whenever another scheme followed it.
    #
    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:^|,)'        # start of the string or ','
                    '[ \t]*'         # optional whitespace
                    '([^ \t,]+)'     # scheme like "Basic"
                    '[ \t]+'         # mandatory whitespace
                    'realm=(["\'])(.*?)\\2',
                    re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password
        self.retried = 0

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Answer an authentication challenge found in *headers*.

        authreq names the challenge header to read.  Returns the response
        of the retried request, or None when there is no usable Basic
        challenge or no stored credentials.

        Raises HTTPError once the retry counter exceeds 5, and ValueError
        when the header value starts with a scheme other than Basic.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if self.retried > 5:
            # retry sending the username:password 5 times before failing.
            raise HTTPError(req.get_full_url(), 401, "basic auth failed",
                            headers, None)
        self.retried += 1

        if not authreq:
            return None

        scheme = authreq.split()[0]
        if scheme.lower() != 'basic':
            raise ValueError("AbstractBasicAuthHandler does not"
                             " support the following scheme: '%s'" %
                             scheme)

        mo = AbstractBasicAuthHandler.rx.search(authreq)
        if not mo:
            return None
        scheme, _quote_char, realm = mo.groups()
        if scheme.lower() != 'basic':
            return None
        response = self.retry_http_basic_auth(host, req, realm)
        if response and response.code != 401:
            # The credentials were accepted: start counting afresh.
            self.retried = 0
        return response

    def retry_http_basic_auth(self, host, req, realm):
        """Resend *req* with stored credentials for *realm* and *host*.

        Returns None when no password is stored or when the request
        already carries exactly these credentials.
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is None:
            return None
        raw = "%s:%s" % (user, pw)
        auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
        if req.headers.get(self.auth_header, None) == auth:
            return None
        req.add_unredirected_header(self.auth_header, auth)
        return self.parent.open(req, timeout=req.timeout)
858
859
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses with Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry req using the 'www-authenticate' challenge in headers."""
        result = self.http_error_auth_reqed('www-authenticate',
                                            req.full_url, req, headers)
        # The attempt counter only matters while a retry is in flight.
        self.reset_retry_count()
        return result
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000870
871
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses with Basic credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Retry req using the 'proxy-authenticate' challenge in headers."""
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        result = self.http_error_auth_reqed('proxy-authenticate',
                                            req.host, req, headers)
        self.reset_retry_count()
        return result
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000886
887
def randombytes(n):
    """Return a bytes object of length n obtained from os.urandom."""
    random_block = os.urandom(n)
    return random_block
891
class AbstractDigestAuthHandler:
    """Shared logic for answering HTTP Digest authentication challenges.

    Concrete handlers mix this class with BaseHandler and provide the
    ``auth_header`` attribute (name of the request header to set) and
    ``parent`` (the opener used to re-send the request).
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        """Use passwd for lookups; default to a new HTTPPasswordMgr."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        """Forget how many authentication attempts were made so far."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Answer the challenge found in header ``auth_header``.

        Returns the response of the re-sent request for a Digest
        challenge and None for a Basic challenge or a missing header.
        Raises HTTPError once more than five attempts were made, and
        ValueError for any other scheme.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        """Re-send req with an Authorization value built from challenge auth.

        Returns None when no authorization can be computed or when the
        request already carries exactly that value.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        """Return a 16-hex-digit client nonce."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the value of the Digest authorization header.

        chal is the parsed challenge (a dict).  Returns the header value
        without the leading "Digest ", or None when the challenge lacks
        realm/nonce, names an unsupported algorithm, or no user is known.
        Raises URLError when the server offers no supported qop.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in [option.strip() for option in qop.split(',')]:
            # The challenge may offer a list such as "auth,auth-int";
            # "auth" is the only quality of protection implemented here,
            # so it is the one selected and named in the digest.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth',
                                           H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) functions for algorithm.

        H hashes a string and returns the hex digest; KD(secret, data)
        is H applied to "secret:data".  For an algorithm other than
        'MD5' or 'SHA' the result is (None, None).
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # XXX MD5-sess
            # Report "unsupported" in the form get_authorization checks
            # for, instead of failing on the unbound name H.
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        """Return the entity digest for data; currently always None."""
        # XXX not implemented yet
        return None
1031
1032
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    handler_order = 490  # before Basic auth
    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry req using the 'www-authenticate' challenge in headers."""
        netloc = urlparse(req.full_url)[1]
        response = self.http_error_auth_reqed('www-authenticate',
                                              netloc, req, headers)
        self.reset_retry_count()
        return response
1049
1050
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 responses with Digest credentials."""

    handler_order = 490  # before Basic auth
    auth_header = 'Proxy-Authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Retry req using the 'proxy-authenticate' challenge in headers."""
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              req.host, req, headers)
        self.reset_retry_count()
        return response
1062
class AbstractHTTPHandler(BaseHandler):
    """Request preparation and sending shared by the HTTP(S) handlers."""

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Store level as the handler's debug level."""
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in the default headers of request and return it.

        Adds Content-type and Content-length for requests with data,
        a Host header, and the opener's ``addheaders``; headers already
        present on the request are left alone.

        Raises URLError when the request has no host, TypeError when
        the data is a str, and ValueError when the data is an iterable
        without a buffer interface and no Content-length was given.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                raise TypeError("POST data should be bytes"
                        " or an iterable of bytes. It cannot be str.")
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                try:
                    mv = memoryview(data)
                except TypeError:
                    # No buffer interface: the length cannot be derived.
                    if isinstance(data, collections.Iterable):
                        raise ValueError("Content-Length should be specified "
                                "for iterable data of type %r %r" % (type(data),
                                data))
                else:
                    request.add_unredirected_header(
                            'Content-length', '%d' % (len(mv) * mv.itemsize))

        sel_host = host
        if request.has_proxy():
            # Through a proxy the selector is the full URL; the Host
            # header must name the host found in it, not the proxy.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        Extra keyword arguments are passed to the http_class constructor.

        Raises URLError when the request has no host or when sending it
        fails with a socket error.  The connection is closed whenever
        sending the request or reading the response fails.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)

        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers)
            except socket.error as err:  # timeout error
                raise URLError(err)
            r = h.getresponse()
        except BaseException:
            # Release the socket on any failure, including one raised
            # while reading the response; the error itself propagates.
            h.close()
            raise

        r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with the reason phrase, because urllib clients expect the
        # response to have the reason in .msg.  It would be good to mark
        # this attribute as deprecated and get them to use info() or
        # .headers.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001164
1165
class HTTPHandler(AbstractHTTPHandler):
    """Handler that opens http URLs with http.client.HTTPConnection."""

    # Request pre-processing is the shared implementation.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Send req and return the response."""
        connection_class = http.client.HTTPConnection
        return self.do_open(connection_class, req)
1172
if hasattr(http.client, 'HTTPSConnection'):
    import ssl

    class HTTPSHandler(AbstractHTTPHandler):
        """Handler that opens https URLs with http.client.HTTPSConnection."""

        # Request pre-processing is the shared implementation.
        https_request = AbstractHTTPHandler.do_request_

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._check_hostname = check_hostname
            self._context = context

        def https_open(self, req):
            """Send req, passing the stored context and check_hostname on."""
            connection_options = {
                'context': self._context,
                'check_hostname': self._check_hostname,
            }
            return self.do_open(http.client.HTTPSConnection, req,
                                **connection_options)
1188
class HTTPCookieProcessor(BaseHandler):
    """Handler that applies a cookie jar to requests and responses."""

    def __init__(self, cookiejar=None):
        """Use cookiejar, or a new http.cookiejar.CookieJar when None."""
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        """Let the jar add its Cookie header to request; return request."""
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Let the jar extract cookies from response; return response."""
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    # https traffic is processed the same way as http traffic.
    https_response = http_response
    https_request = http_request
1206
class UnknownHandler(BaseHandler):
    """Handler that rejects every request it is asked to open."""

    def unknown_open(self, req):
        """Raise URLError naming the type of req."""
        raise URLError('unknown url type: %s' % req.type)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001211
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Each element is split on its first '='.  A value enclosed in double
    quotes is returned without them.  Returns a dict mapping key to
    value.  Raises ValueError for an element that contains no '='.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices rather than indexes, so that an empty value such as
        # 'opaque=' is accepted instead of raising IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1221
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    Split s on commas, except for commas inside a double-quoted string
    or following a backslash within one.  Quotes may also appear in the
    middle of an unquoted element.  Only double quotes are recognised;
    single quotes have no special meaning.  The backslash of an escape
    inside quotes is dropped, the escaped character is kept.  Elements
    are returned stripped of surrounding whitespace.
    """
    items = []
    chars = []        # characters of the element being collected
    escaped = False   # previous character was a backslash inside quotes
    in_quotes = False

    for ch in s:
        if escaped:
            chars.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                chars.append(ch)
        elif ch == ',':
            items.append(''.join(chars))
            chars = []
        else:
            if ch == '"':
                in_quotes = True
            chars.append(ch)

    # append last part
    if chars:
        items.append(''.join(chars))

    return [item.strip() for item in items]
1264
class FileHandler(BaseHandler):
    """Handler that opens file URLs referring to the local host."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open the local file named by req.

        Raises URLError when the URL names a host that is neither
        'localhost' nor one of the addresses returned by get_names().
        """
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            # Membership test: get_names() returns a tuple of addresses,
            # so an identity comparison against it could never succeed.
            if req.host not in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the tuple of addresses of this host, cached on the class."""
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the file named by req.selector.

        The headers carry Content-type, Content-length and
        Last-modified taken from the file.  Raises URLError when the
        file cannot be accessed or the host of req is not local.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001316
1317def _safe_gethostbyname(host):
1318 try:
1319 return socket.gethostbyname(host)
1320 except socket.gaierror:
1321 return None
1322
class FTPHandler(BaseHandler):
    """Handler that opens ftp URLs."""

    def ftp_open(self, req):
        """Retrieve the file or directory named by req over FTP.

        Returns an addinfourl whose headers carry Content-type and
        Content-length when they are known.  Raises URLError when no
        host is given, the host cannot be resolved, or ftplib fails.
        """
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        port = ftplib.FTP_PORT if port is None else int(port)

        # username/password handling
        userinfo, host = splituser(host)
        if userinfo:
            user, passwd = splitpasswd(userinfo)
        else:
            user, passwd = userinfo, None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)

        path, attrs = splitattr(req.selector)
        segments = [unquote(segment) for segment in path.split('/')]
        dirs, file = segments[:-1], segments[-1]
        if dirs and not dirs[0]:
            # drop the empty segment produced by the leading '/'
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # image (binary) transfer for a file, directory listing otherwise
            transfer_type = 'I' if file else 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if (attr.lower() == 'type' and
                        value in ('a', 'A', 'i', 'I', 'd', 'D')):
                    transfer_type = value.upper()
            fp, retrlen = fw.retrfile(file, transfer_type)
            header_lines = []
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                header_lines.append("Content-type: %s\n" % mtype)
            if retrlen is not None and retrlen >= 0:
                header_lines.append("Content-length: %d\n" % retrlen)
            headers = email.message_from_string("".join(header_lines))
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a new, non-persistent ftpwrapper for the arguments."""
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001380
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps connections in a cache for reuse."""

    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}      # key -> ftpwrapper
        self.timeout = {}    # key -> time after which the entry is stale
        self.soonest = 0     # smallest value in self.timeout
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        """Set the delay added to the current time on each use of an entry."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which one entry is dropped."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return the cached ftpwrapper for the arguments, creating it if new."""
        key = user, host, port, '/'.join(dirs), timeout
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Close stale entries, then drop one entry if the cache is full."""
        # first check for old ones
        now = time.time()
        if self.soonest <= now:
            for key, deadline in list(self.timeout.items()):
                if deadline >= now:
                    continue
                self.cache[key].close()
                del self.cache[key]
                del self.timeout[key]
        self.soonest = min(self.timeout.values())

        # then check the size
        if len(self.cache) == self.max_conns:
            for key, deadline in list(self.timeout.items()):
                if deadline == self.soonest:
                    del self.cache[key]
                    del self.timeout[key]
                    break
            self.soonest = min(self.timeout.values())

    def clear_cache(self):
        """Close every cached connection and empty the cache."""
        for connection in self.cache.values():
            connection.close()
        self.cache.clear()
        self.timeout.clear()
1433
1434
# Code moved from the old urllib module
1436
1437MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
1438
1439# Helper for non-unix systems
if os.name != 'nt':
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        fs_path = unquote(pathname)
        return fs_path

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        url_path = quote(pathname)
        return url_path
else:
    from nturl2path import url2pathname, pathname2url
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001452
1453# This really consists of two pieces:
1454# (1) a class which handles opening of all sorts of URLs
1455# (plus assorted utilities etc.)
1456# (2) a set of functions for parsing URLs
1457# XXX Should these be separated out into different modules?
1458
1459
1460ftpcache = {}
1461class URLopener:
1462 """Class to open URLs.
1463 This is a class rather than just a subroutine because we may need
1464 more than one set of global protocol-specific options.
1465 Note -- this is a base class for those who don't want the
1466 automatic handling of errors type 302 (relocated) and 401
1467 (authorization needed)."""
1468
1469 __tempfiles = None
1470
1471 version = "Python-urllib/%s" % __version__
1472
1473 # Constructor
1474 def __init__(self, proxies=None, **x509):
1475 if proxies is None:
1476 proxies = getproxies()
1477 assert hasattr(proxies, 'keys'), "proxies must be a mapping"
1478 self.proxies = proxies
1479 self.key_file = x509.get('key_file')
1480 self.cert_file = x509.get('cert_file')
1481 self.addheaders = [('User-Agent', self.version)]
1482 self.__tempfiles = []
1483 self.__unlink = os.unlink # See cleanup()
1484 self.tempcache = None
1485 # Undocumented feature: if you assign {} to tempcache,
1486 # it is used to cache files retrieved with
1487 # self.retrieve(). This is not enabled by default
1488 # since it does not work for changing documents (and I
1489 # haven't got the logic to check expiration headers
1490 # yet).
1491 self.ftpcache = ftpcache
1492 # Undocumented feature: you can use a different
1493 # ftp cache by assigning to the .ftpcache member;
1494 # in case you want logically independent URL openers
1495 # XXX This is not threadsafe. Bah.
1496
1497 def __del__(self):
1498 self.close()
1499
1500 def close(self):
1501 self.cleanup()
1502
1503 def cleanup(self):
1504 # This code sometimes runs when the rest of this module
1505 # has already been deleted, so it can't use any globals
1506 # or import anything.
1507 if self.__tempfiles:
1508 for file in self.__tempfiles:
1509 try:
1510 self.__unlink(file)
1511 except OSError:
1512 pass
1513 del self.__tempfiles[:]
1514 if self.tempcache:
1515 self.tempcache.clear()
1516
1517 def addheader(self, *args):
1518 """Add a header to be used by the HTTP interface only
1519 e.g. u.addheader('Accept', 'sound/basic')"""
1520 self.addheaders.append(args)
1521
1522 # External interface
1523 def open(self, fullurl, data=None):
1524 """Use URLopener().open(file) instead of open(file, 'r')."""
Georg Brandl13e89462008-07-01 19:56:00 +00001525 fullurl = unwrap(to_bytes(fullurl))
Senthil Kumaran734f0592010-02-20 22:19:04 +00001526 fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001527 if self.tempcache and fullurl in self.tempcache:
1528 filename, headers = self.tempcache[fullurl]
1529 fp = open(filename, 'rb')
Georg Brandl13e89462008-07-01 19:56:00 +00001530 return addinfourl(fp, headers, fullurl)
1531 urltype, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001532 if not urltype:
1533 urltype = 'file'
1534 if urltype in self.proxies:
1535 proxy = self.proxies[urltype]
Georg Brandl13e89462008-07-01 19:56:00 +00001536 urltype, proxyhost = splittype(proxy)
1537 host, selector = splithost(proxyhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001538 url = (host, fullurl) # Signal special case to open_*()
1539 else:
1540 proxy = None
1541 name = 'open_' + urltype
1542 self.type = urltype
1543 name = name.replace('-', '_')
1544 if not hasattr(self, name):
1545 if proxy:
1546 return self.open_unknown_proxy(proxy, fullurl, data)
1547 else:
1548 return self.open_unknown(fullurl, data)
1549 try:
1550 if data is None:
1551 return getattr(self, name)(url)
1552 else:
1553 return getattr(self, name)(url, data)
Antoine Pitrou6b4883d2011-10-12 02:54:14 +02001554 except HTTPError:
1555 raise
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001556 except socket.error as msg:
1557 raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1558
1559 def open_unknown(self, fullurl, data=None):
1560 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001561 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001562 raise IOError('url error', 'unknown url type', type)
1563
1564 def open_unknown_proxy(self, proxy, fullurl, data=None):
1565 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001566 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001567 raise IOError('url error', 'invalid proxy for %s' % type, proxy)
1568
1569 # External interface
def retrieve(self, url, filename=None, reporthook=None, data=None):
    """Copy the object at *url* into a local file.

    Returns (filename, headers).  For a local ``file:`` URL with no
    *filename* given, nothing is copied and the file's own path is
    returned.  Otherwise the body is written to *filename*, or to a
    fresh temporary file whose name is remembered in the opener's
    temp-file list.

    *reporthook*, if given, is called as reporthook(blocknum, blocksize,
    totalsize) once before the first read and once after each block;
    totalsize is -1 when no Content-Length header is present.

    Raises ContentTooShortError when fewer bytes arrive than the
    Content-Length header announced.  The check is performed whether or
    not a *reporthook* was supplied.
    """
    url = unwrap(to_bytes(url))
    if self.tempcache and url in self.tempcache:
        return self.tempcache[url]
    scheme, rest = splittype(url)
    if filename is None and (not scheme or scheme == 'file'):
        try:
            fp = self.open_local_file(rest)
            hdrs = fp.info()
            fp.close()
            return url2pathname(splithost(rest)[1]), hdrs
        except IOError:
            # Not openable as a local file; fall back to the generic path.
            pass
    fp = self.open(url, data)
    try:
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            import tempfile
            # Derive a suffix for the temp file from the URL's path part.
            path = splittype(url)[1]
            path = splithost(path or "")[1]
            path = splitquery(path or "")[0]
            path = splitattr(path or "")[0]
            suffix = os.path.splitext(path)[1]
            (fd, filename) = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        try:
            result = filename, headers
            if self.tempcache is not None:
                self.tempcache[url] = result
            bs = 1024*8
            size = -1
            read = 0
            blocknum = 0
            # Parse the announced length unconditionally so the
            # short-read check below works without a reporthook too.
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
        finally:
            tfp.close()
    finally:
        fp.close()

    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result
1633
1634 # Each method named open_<type> knows how to open that type of URL
1635
1636 def _open_generic_http(self, connection_factory, url, data):
1637 """Make an HTTP connection using connection_class.
1638
1639 This is an internal method that should be called from
1640 open_http() or open_https().
1641
1642 Arguments:
1643 - connection_factory should take a host name and return an
1644 HTTPConnection instance.
1645 - url is the url to retrieval or a host, relative-path pair.
1646 - data is payload for a POST request or None.
1647 """
1648
1649 user_passwd = None
1650 proxy_passwd= None
1651 if isinstance(url, str):
Georg Brandl13e89462008-07-01 19:56:00 +00001652 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001653 if host:
Georg Brandl13e89462008-07-01 19:56:00 +00001654 user_passwd, host = splituser(host)
1655 host = unquote(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001656 realhost = host
1657 else:
1658 host, selector = url
1659 # check whether the proxy contains authorization information
Georg Brandl13e89462008-07-01 19:56:00 +00001660 proxy_passwd, host = splituser(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001661 # now we proceed with the url we want to obtain
Georg Brandl13e89462008-07-01 19:56:00 +00001662 urltype, rest = splittype(selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001663 url = rest
1664 user_passwd = None
1665 if urltype.lower() != 'http':
1666 realhost = None
1667 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001668 realhost, rest = splithost(rest)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001669 if realhost:
Georg Brandl13e89462008-07-01 19:56:00 +00001670 user_passwd, realhost = splituser(realhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001671 if user_passwd:
1672 selector = "%s://%s%s" % (urltype, realhost, rest)
1673 if proxy_bypass(realhost):
1674 host = realhost
1675
1676 #print "proxy via http:", host, selector
1677 if not host: raise IOError('http error', 'no host given')
1678
1679 if proxy_passwd:
1680 import base64
Senthil Kumaran5626eec2010-08-04 17:46:23 +00001681 proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001682 else:
1683 proxy_auth = None
1684
1685 if user_passwd:
1686 import base64
Senthil Kumaran5626eec2010-08-04 17:46:23 +00001687 auth = base64.b64encode(user_passwd.encode()).decode('ascii')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001688 else:
1689 auth = None
1690 http_conn = connection_factory(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001691 headers = {}
1692 if proxy_auth:
1693 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
1694 if auth:
1695 headers["Authorization"] = "Basic %s" % auth
1696 if realhost:
1697 headers["Host"] = realhost
Senthil Kumarand91ffca2011-03-19 17:25:27 +08001698
1699 # Add Connection:close as we don't support persistent connections yet.
1700 # This helps in closing the socket and avoiding ResourceWarning
1701
1702 headers["Connection"] = "close"
1703
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001704 for header, value in self.addheaders:
1705 headers[header] = value
1706
1707 if data is not None:
1708 headers["Content-Type"] = "application/x-www-form-urlencoded"
1709 http_conn.request("POST", selector, data, headers)
1710 else:
1711 http_conn.request("GET", selector, headers=headers)
1712
1713 try:
1714 response = http_conn.getresponse()
1715 except http.client.BadStatusLine:
1716 # something went wrong with the HTTP status line
Georg Brandl13e89462008-07-01 19:56:00 +00001717 raise URLError("http protocol error: bad status line")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001718
1719 # According to RFC 2616, "2xx" code indicates that the client's
1720 # request was successfully received, understood, and accepted.
1721 if 200 <= response.status < 300:
Antoine Pitroub353c122009-02-11 00:39:14 +00001722 return addinfourl(response, response.msg, "http:" + url,
Georg Brandl13e89462008-07-01 19:56:00 +00001723 response.status)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001724 else:
1725 return self.http_error(
1726 url, response.fp,
1727 response.status, response.reason, response.msg, data)
1728
def open_http(self, url, data=None):
    """Open *url* over plain HTTP, POSTing *data* when it is not None."""
    factory = http.client.HTTPConnection
    return self._open_generic_http(factory, url, data)
1732
def http_error(self, url, fp, errcode, errmsg, headers, data=None):
    """Dispatch a non-2xx HTTP reply to the matching handler.

    If the opener has a method named http_error_DDD (DDD being the
    numeric status code) it is tried first; *data* is forwarded to it
    only when it is not None.  When there is no such method, or it
    returns a false value, http_error_default() is used instead.
    Derived classes can override this method or supply such handlers.
    """
    handler_name = 'http_error_%d' % errcode
    if hasattr(self, handler_name):
        handler = getattr(self, handler_name)
        extra = () if data is None else (data,)
        outcome = handler(url, fp, errcode, errmsg, headers, *extra)
        if outcome:
            return outcome
    return self.http_error_default(url, fp, errcode, errmsg, headers)
1748
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Fallback error handler: drain and close *fp*, then raise HTTPError."""
    # The body is read only to consume it; its contents are discarded.
    fp.read()
    fp.close()
    raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001754
if _have_ssl:
    def _https_connection(self, host):
        """Build an HTTPSConnection to *host* using the opener's key/cert files."""
        tls_options = dict(key_file=self.key_file, cert_file=self.cert_file)
        return http.client.HTTPSConnection(host, **tls_options)

    def open_https(self, url, data=None):
        """Open *url* over HTTPS, POSTing *data* when it is not None."""
        factory = self._https_connection
        return self._open_generic_http(factory, url, data)
1764
def open_file(self, url):
    """Open a file: URL, which must refer to the local machine.

    Raises URLError when *url* is not a string (i.e. a proxy pair was
    passed) and ValueError when the URL names a host other than
    "localhost".  Everything else is handed to open_local_file().
    """
    if not isinstance(url, str):
        raise URLError('file error', 'proxy support for file protocol currently not implemented')
    has_authority = url[:2] == '//'
    empty_authority = url[2:3] == '/'
    names_localhost = url[2:12].lower() == 'localhost/'
    if has_authority and not empty_authority and not names_localhost:
        raise ValueError("file:// scheme is supported only on localhost")
    return self.open_local_file(url)
1773
def open_local_file(self, url):
    """Open a file on the local machine and return it as an addinfourl.

    *url* is the part of a file: URL after the scheme, optionally
    starting with //host.  The response carries synthetic Content-Type,
    Content-Length and Last-modified headers built from the file's
    stat() data and its guessed MIME type.

    Raises URLError when the file cannot be stat()ed, or when the URL
    names a port or a host that does not resolve to this machine.
    """
    import email.utils
    import mimetypes
    host, file = splithost(url)
    localname = url2pathname(file)
    try:
        stats = os.stat(localname)
    except OSError as e:
        # URLError takes (reason, filename); passing errno as a third
        # positional argument would itself fail with TypeError.
        raise URLError(e.strerror, e.filename)
    size = stats.st_size
    modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
    mtype = mimetypes.guess_type(url)[0]
    headers = email.message_from_string(
        'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
        (mtype or 'text/plain', size, modified))
    if host:
        host, port = splitport(host)
        # localhost() yields a single address string while thishost()
        # yields a tuple, so wrap the former before concatenating.
        local_addresses = (localhost(),) + thishost()
        if port or socket.gethostbyname(host) not in local_addresses:
            raise URLError('local file error', 'not on local host')
    urlfile = file
    if file[:1] == '/':
        urlfile = 'file://' + file
    return addinfourl(open(localname, 'rb'), headers, urlfile)
1803
def open_ftp(self, url):
    """Open an ftp: URL and return the transfer as an addinfourl.

    *url* is the part of the URL after the scheme (//[user[:passwd]@]
    host[:port]/path[;type=X]).  Connections are kept in self.ftpcache,
    keyed by (user, host, port, directory); when the cache holds more
    than MAXFTPCACHE entries every other connection is closed and
    dropped first.

    Raises URLError when *url* is not a string (proxy pair), when no
    host is given, or when the FTP layer reports an error.
    """
    if not isinstance(url, str):
        raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
    import mimetypes
    host, path = splithost(url)
    if not host:
        raise URLError('ftp error', 'no host given')
    host, port = splitport(host)
    user, host = splituser(host)
    if user:
        user, passwd = splitpasswd(user)
    else:
        passwd = None
    host = unquote(host)
    user = unquote(user or '')
    passwd = unquote(passwd or '')
    host = socket.gethostbyname(host)
    if not port:
        import ftplib
        port = ftplib.FTP_PORT
    else:
        port = int(port)
    path, attrs = splitattr(path)
    path = unquote(path)
    dirs = path.split('/')
    dirs, file = dirs[:-1], dirs[-1]
    if dirs and not dirs[0]:
        dirs = dirs[1:]
    if dirs and not dirs[0]:
        dirs[0] = '/'
    key = user, host, port, '/'.join(dirs)
    # XXX thread unsafe!
    if len(self.ftpcache) > MAXFTPCACHE:
        # Prune the cache, rather arbitrarily.  Iterate over a snapshot
        # of the keys: deleting from a dict while iterating over it
        # raises RuntimeError.
        for k in list(self.ftpcache):
            if k != key:
                v = self.ftpcache.pop(k)
                v.close()
    try:
        if key not in self.ftpcache:
            self.ftpcache[key] = \
                ftpwrapper(user, passwd, host, port, dirs)
        # No file name means a directory listing; otherwise binary.
        ftype = 'I' if file else 'D'
        for attr in attrs:
            attr, value = splitvalue(attr)
            if attr.lower() == 'type' and \
               value in ('a', 'A', 'i', 'I', 'd', 'D'):
                ftype = value.upper()
        (fp, retrlen) = self.ftpcache[key].retrfile(file, ftype)
        mtype = mimetypes.guess_type("ftp:" + url)[0]
        headers = ""
        if mtype:
            headers += "Content-Type: %s\n" % mtype
        if retrlen is not None and retrlen >= 0:
            headers += "Content-Length: %d\n" % retrlen
        headers = email.message_from_string(headers)
        return addinfourl(fp, headers, "ftp:" + url)
    except ftperrors() as msg:
        raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1862
def open_data(self, url, data=None):
    """Open a "data" URL and return its contents as an addinfourl.

    Any POSTed *data* is ignored.  The returned file object holds a
    synthetic message: Date, Content-type and Content-Length header
    lines, a blank line, and then the decoded payload.
    """
    if not isinstance(url, str):
        raise URLError('data error', 'proxy support for data protocol currently not implemented')
    # syntax of data URLs:
    # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    # mediatype := [ type "/" subtype ] *( ";" parameter )
    # data      := *urlchar
    # parameter := attribute "=" value
    try:
        mediatype, payload = url.split(',', 1)
    except ValueError:
        raise IOError('data error', 'bad data URL')
    if not mediatype:
        mediatype = 'text/plain;charset=US-ASCII'
    # A trailing ";token" without "=" is the encoding, not a parameter.
    encoding = ''
    semi = mediatype.rfind(';')
    if semi >= 0 and '=' not in mediatype[semi:]:
        encoding = mediatype[semi+1:]
        mediatype = mediatype[:semi]
    stamp = time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                          time.gmtime(time.time()))
    if encoding == 'base64':
        import base64
        # XXX is this encoding/decoding ok?
        payload = base64.decodebytes(payload.encode('ascii')).decode('latin-1')
    else:
        payload = unquote(payload)
    msg = '\n'.join([
        'Date: %s' % stamp,
        'Content-type: %s' % mediatype,
        'Content-Length: %d' % len(payload),
        '',
        payload,
    ])
    headers = email.message_from_string(msg)
    f = io.StringIO(msg)
    return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001904
1905
class FancyURLopener(URLopener):
    """URLopener subclass with handlers for the errors it can deal with.

    Redirects (301, 302, 303, and 307 for requests without data) are
    followed.  401 and 407 replies raise HTTPError unless the handler is
    called with retry=True, in which case Basic credentials are obtained
    and the request is repeated.  Any other error status is returned as
    a response object rather than raised.
    """

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # (user, passwd) pairs keyed by "realm@host".
        self.auth_cache = {}
        # Redirects followed so far for the request in progress.
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        Once maxtries redirects have been followed the chain is reported
        as a 500 "Redirect Recursion" error.  The redirect counter is
        reset on every exit path, including when following the redirect
        raises, so a failed request cannot eat into the budget of the
        next one.
        """
        self.tries += 1
        try:
            if self.maxtries and self.tries >= self.maxtries:
                if hasattr(self, "http_error_500"):
                    meth = self.http_error_500
                else:
                    meth = self.http_error_default
                return meth(url, fp, 500,
                            "Internal Server Error: Redirect Recursion",
                            headers)
            return self.redirect_internal(url, fp, errcode, errmsg,
                                          headers, data)
        finally:
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Follow the redirect named by the Location (or URI) header.

        Returns None when neither header is present.  Raises HTTPError
        when the target scheme is not http, https or ftp.  The new URL
        is opened without *data*.
        """
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        # Drain and release the redirect reply before moving on.
        fp.read()
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3

        if urlparts.scheme not in ('http', 'https', 'ftp'):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def _basic_realm_or_raise(self, header, url, fp, errcode, errmsg,
                              headers, retry):
        """Return the realm of the Basic challenge in *header*, else raise.

        HTTPError is raised (via URLopener.http_error_default) when the
        header is missing or unparsable, when the scheme is not Basic,
        or when *retry* is false.
        """
        import re
        if header not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"',
                         headers[header])
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                                         headers)
        return realm

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        realm = self._basic_realm_or_raise('www-authenticate', url, fp,
                                           errcode, errmsg, headers, retry)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        else:
            return getattr(self, name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        realm = self._basic_realm_or_raise('proxy-authenticate', url, fp,
                                           errcode, errmsg, headers, retry)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        else:
            return getattr(self, name)(url, realm, data)

    def _retry_proxy_basic_auth(self, scheme, url, realm, data):
        """Store credentials in the *scheme* proxy URL and reopen *url*."""
        host, selector = splithost(url)
        newurl = scheme + '://' + host + selector
        proxy = self.proxies[scheme]
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # Non-zero i means the proxy URL already carried credentials
        # (which evidently failed), so any cached pair is discarded.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd):
            return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies[scheme] = scheme + '://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def _retry_basic_auth(self, scheme, url, realm, data):
        """Reopen *url* over *scheme* with user:passwd@ credentials added."""
        host, selector = splithost(url)
        # Non-zero i means the URL already carried credentials (which
        # evidently failed), so any cached pair is discarded.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd):
            return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = scheme + '://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Retry an http request after adding credentials to the proxy URL."""
        return self._retry_proxy_basic_auth('http', url, realm, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Retry an https request after adding credentials to the proxy URL."""
        return self._retry_proxy_basic_auth('https', url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Retry an http request with credentials embedded in the URL."""
        return self._retry_basic_auth('http', url, realm, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Retry an https request with credentials embedded in the URL."""
        return self._retry_basic_auth('https', url, realm, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for *realm* at *host*, prompting if needed.

        A cached pair is reused unless *clear_cache* is true, in which
        case it is dropped and the user is prompted again.  A non-empty
        answer is cached.
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd:
            self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
2117
2118
2119# Utility functions
2120
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup is performed on first use and the answer is kept in a
    module-level variable for later calls.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2128
_thishost = None
def thishost():
    """Return the IP addresses of the current host as a tuple.

    The lookup is performed on first use and the answer is kept in a
    module-level variable for later calls.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    addresses = socket.gethostbyname_ex(socket.gethostname())[2]
    _thishost = tuple(addresses)
    return _thishost
2136
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported on first use only; the value is then kept in a
    module-level variable.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2145
_noheaders = None
def noheaders():
    """Return an empty email Message object.

    The object is created on first use and the same instance is
    returned by every later call.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2153
2154
2155# Utility classes
2156
class ftpwrapper:
    """Cached FTP connection, as kept by open_ftp() in its ftpcache."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        self.user, self.passwd = user, passwd
        self.host, self.port = host, port
        self.dirs = dirs
        self.timeout = timeout
        # Number of file objects handed out by retrfile() and still open.
        self.refcount = 0
        # While true, closing the last file leaves the connection open.
        self.keepalive = persistent
        self.init()

    def init(self):
        """(Re)connect, log in and change into the configured directories."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dirname in self.dirs:
            self.ftp.cwd(dirname)

    def retrfile(self, file, type):
        """Start a transfer and return (file object, length).

        *type* is the FTP transfer type letter; 'd'/'D' requests a
        directory listing.  If *file* cannot be retrieved because the
        server answers 550, a listing of it is attempted instead.
        """
        import ftplib
        self.endtransfer()
        isdir = type in ('d', 'D')
        cmd = 'TYPE A' if isdir else 'TYPE ' + type
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection may have gone stale: reconnect once.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                conn, retrlen = self.ftp.ntransfercmd('RETR ' + file)
            except ftplib.error_perm as reason:
                if not str(reason).startswith('550'):
                    raise URLError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if not file:
                cmd = 'LIST'
            else:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error', reason) from reason
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        """Collect the server's reply to a pending transfer, if any."""
        if self.busy:
            self.busy = 0
            try:
                self.ftp.voidresp()
            except ftperrors():
                pass

    def close(self):
        """Drop the connection now, or once the last open file is closed."""
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        """Close hook of a file returned by retrfile()."""
        self.endtransfer()
        self.refcount -= 1
        if not self.keepalive and self.refcount <= 0:
            self.real_close()

    def real_close(self):
        """Finish any pending transfer and close the FTP connection."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2252
2253# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Every environment variable named <scheme>_proxy (in any letter
    case) that has a non-empty value contributes one entry, keyed by
    the lower-cased scheme.  If you need a different way, you can pass
    a proxies dictionary to the [Fancy]URLopener constructor.
    """
    suffix = '_proxy'
    proxies = {}
    for env_name, env_value in os.environ.items():
        lowered = env_name.lower()
        if not env_value or lowered[-6:] != suffix:
            continue
        proxies[lowered[:-6]] = env_value
    return proxies
2269
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Reads the no_proxy (or, failing that, NO_PROXY) environment
    variable, which should hold a comma-separated list of DNS suffixes,
    or '*' to bypass the proxy for every host.  Returns 1 when *host*,
    with or without its port, ends with one of the suffixes, else 0.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly = splitport(host)[0]
    # check if the host ends with any of the DNS suffixes
    suffixes = (entry.strip() for entry in no_proxy.split(','))
    matched = any(
        suffix and (hostonly.endswith(suffix) or host.endswith(suffix))
        for suffix in suffixes)
    # otherwise, don't bypass
    return 1 if matched else 0
2289
2290
Ronald Oussorene72e1612011-03-14 18:15:25 -04002291# This code tests an OSX specific data structure but is testable on all
2292# platforms
2293def _proxy_bypass_macosx_sysconf(host, proxy_settings):
2294 """
2295 Return True iff this host shouldn't be accessed using a proxy
2296
2297 This function uses the MacOSX framework SystemConfiguration
2298 to fetch the proxy information.
2299
2300 proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
2301 { 'exclude_simple': bool,
2302 'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
2303 }
2304 """
2305 import re
2306 import socket
2307 from fnmatch import fnmatch
2308
2309 hostonly, port = splitport(host)
2310
2311 def ip2num(ipAddr):
2312 parts = ipAddr.split('.')
2313 parts = list(map(int, parts))
2314 if len(parts) != 4:
2315 parts = (parts + [0, 0, 0, 0])[:4]
2316 return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
2317
2318 # Check for simple host names:
2319 if '.' not in host:
2320 if proxy_settings['exclude_simple']:
2321 return True
2322
2323 hostIP = None
2324
2325 for value in proxy_settings.get('exceptions', ()):
2326 # Items in the list are strings like these: *.local, 169.254/16
2327 if not value: continue
2328
2329 m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
2330 if m is not None:
2331 if hostIP is None:
2332 try:
2333 hostIP = socket.gethostbyname(hostonly)
2334 hostIP = ip2num(hostIP)
2335 except socket.error:
2336 continue
2337
2338 base = ip2num(m.group(1))
2339 mask = m.group(2)
2340 if mask is None:
2341 mask = 8 * (m.group(1).count('.') + 1)
2342 else:
2343 mask = int(mask[1:])
2344 mask = 32 - mask
2345
2346 if (hostIP >> mask) == (base >> mask):
2347 return True
2348
2349 elif fnmatch(host, value):
2350 return True
2351
2352 return False
2353
2354
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002355if sys.platform == 'darwin':
Ronald Oussoren84151202010-04-18 20:46:11 +00002356 from _scproxy import _get_proxy_settings, _get_proxies
2357
2358 def proxy_bypass_macosx_sysconf(host):
Ronald Oussoren84151202010-04-18 20:46:11 +00002359 proxy_settings = _get_proxy_settings()
Ronald Oussorene72e1612011-03-14 18:15:25 -04002360 return _proxy_bypass_macosx_sysconf(host, proxy_settings)
Ronald Oussoren84151202010-04-18 20:46:11 +00002361
2362 def getproxies_macosx_sysconf():
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002363 """Return a dictionary of scheme -> proxy server URL mappings.
2364
Ronald Oussoren84151202010-04-18 20:46:11 +00002365 This function uses the MacOSX framework SystemConfiguration
2366 to fetch the proxy information.
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002367 """
Ronald Oussoren84151202010-04-18 20:46:11 +00002368 return _get_proxies()
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002369
Ronald Oussoren84151202010-04-18 20:46:11 +00002370
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002371
2372 def proxy_bypass(host):
2373 if getproxies_environment():
2374 return proxy_bypass_environment(host)
2375 else:
Ronald Oussoren84151202010-04-18 20:46:11 +00002376 return proxy_bypass_macosx_sysconf(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002377
2378 def getproxies():
Ronald Oussoren84151202010-04-18 20:46:11 +00002379 return getproxies_environment() or getproxies_macosx_sysconf()
2380
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002381
2382elif os.name == 'nt':
2383 def getproxies_registry():
2384 """Return a dictionary of scheme -> proxy server URL mappings.
2385
2386 Win32 uses the registry to store proxies.
2387
2388 """
2389 proxies = {}
2390 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002391 import winreg
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002392 except ImportError:
2393 # Std module, so should be around - but you never know!
2394 return proxies
2395 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002396 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002397 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002398 proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002399 'ProxyEnable')[0]
2400 if proxyEnable:
2401 # Returned as Unicode but problems if not converted to ASCII
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002402 proxyServer = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002403 'ProxyServer')[0])
2404 if '=' in proxyServer:
2405 # Per-protocol settings
2406 for p in proxyServer.split(';'):
2407 protocol, address = p.split('=', 1)
2408 # See if address has a type:// prefix
2409 import re
2410 if not re.match('^([^/:]+)://', address):
2411 address = '%s://%s' % (protocol, address)
2412 proxies[protocol] = address
2413 else:
2414 # Use one setting for all protocols
2415 if proxyServer[:5] == 'http:':
2416 proxies['http'] = proxyServer
2417 else:
2418 proxies['http'] = 'http://%s' % proxyServer
Senthil Kumaran04f31b82010-07-14 20:10:52 +00002419 proxies['https'] = 'https://%s' % proxyServer
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002420 proxies['ftp'] = 'ftp://%s' % proxyServer
2421 internetSettings.Close()
2422 except (WindowsError, ValueError, TypeError):
2423 # Either registry key not found etc, or the value in an
2424 # unexpected format.
2425 # proxies already set up to be empty so nothing to do
2426 pass
2427 return proxies
2428
2429 def getproxies():
2430 """Return a dictionary of scheme -> proxy server URL mappings.
2431
2432 Returns settings gathered from the environment, if specified,
2433 or the registry.
2434
2435 """
2436 return getproxies_environment() or getproxies_registry()
2437
2438 def proxy_bypass_registry(host):
2439 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002440 import winreg
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002441 import re
2442 except ImportError:
2443 # Std modules, so should be around - but you never know!
2444 return 0
2445 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002446 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002447 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002448 proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002449 'ProxyEnable')[0]
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002450 proxyOverride = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002451 'ProxyOverride')[0])
2452 # ^^^^ Returned as Unicode but problems if not converted to ASCII
2453 except WindowsError:
2454 return 0
2455 if not proxyEnable or not proxyOverride:
2456 return 0
2457 # try to make a host list from name and IP address.
Georg Brandl13e89462008-07-01 19:56:00 +00002458 rawHost, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002459 host = [rawHost]
2460 try:
2461 addr = socket.gethostbyname(rawHost)
2462 if addr != rawHost:
2463 host.append(addr)
2464 except socket.error:
2465 pass
2466 try:
2467 fqdn = socket.getfqdn(rawHost)
2468 if fqdn != rawHost:
2469 host.append(fqdn)
2470 except socket.error:
2471 pass
2472 # make a check value list from the registry entry: replace the
2473 # '<local>' string by the localhost entry and the corresponding
2474 # canonical entry.
2475 proxyOverride = proxyOverride.split(';')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002476 # now check if we match one of the registry values.
2477 for test in proxyOverride:
Senthil Kumaran49476062009-05-01 06:00:23 +00002478 if test == '<local>':
2479 if '.' not in rawHost:
2480 return 1
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002481 test = test.replace(".", r"\.") # mask dots
2482 test = test.replace("*", r".*") # change glob sequence
2483 test = test.replace("?", r".") # change glob char
2484 for val in host:
2485 # print "%s <--> %s" %( test, val )
2486 if re.match(test, val, re.I):
2487 return 1
2488 return 0
2489
2490 def proxy_bypass(host):
2491 """Return a dictionary of scheme -> proxy server URL mappings.
2492
2493 Returns settings gathered from the environment, if specified,
2494 or the registry.
2495
2496 """
2497 if getproxies_environment():
2498 return proxy_bypass_environment(host)
2499 else:
2500 return proxy_bypass_registry(host)
2501
2502else:
2503 # By default use environment variables
2504 getproxies = getproxies_environment
2505 proxy_bypass = proxy_bypass_environment