blob: d6f9f9a1bcaa8efc186d9499da9fb781eed84068 [file] [log] [blame]
"""An extensible library for opening URLs using a variety of protocols

The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.

The OpenerDirector manages a collection of Handler objects that do
all the actual work.  Each Handler implements a particular protocol or
option.  The OpenerDirector is a composite object that invokes the
Handlers needed to open the requested URL.  For example, the
HTTPHandler performs HTTP GET and POST requests and deals with
non-error returns.  The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
deals with digest authentication.

urlopen(url, data=None) -- Basic usage is the same as the original
urllib.  Pass the URL and optionally data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of a URL.  Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.

build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of a default
handler, the argument will be installed instead of the default.

install_opener -- Installs a new opener as the default opener.

objects of interest:

OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
the Handler classes, while dealing with requests and responses.

Request -- An object that encapsulates the state of a request.  The
state can be as simple as the URL.  It can also include extra HTTP
headers, e.g. a User-Agent.

BaseHandler --

internals:
BaseHandler and parent
_call_chain conventions

Example usage:

import urllib.request

# set up authentication info
authinfo = urllib.request.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
                      uri='https://mahler:8092/site-updates.py',
                      user='klem',
                      passwd='geheim$parole')

proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib.request.build_opener(proxy_support, authinfo,
                                     urllib.request.CacheFTPHandler)

# install it
urllib.request.install_opener(opener)

f = urllib.request.urlopen('http://www.python.org/')
"""
69
# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails for some reason, how should the error be signalled?  The
# client needs to know the HTTP error code.  But if the handler knows
# what the problem was, e.g., that it didn't know the hash algorithm
# requested in the challenge, it would be good to pass that
# information along to the client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation

# Possible extensions:
# complex proxies  XXX not sure what exactly was meant by this
# abstract factory for opener
83
84import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000085import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000086import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import random
93import re
94import socket
95import sys
96import time
Senthil Kumaran7bc0d872010-12-19 10:49:52 +000097import collections
Senthil Kumaran0ea91cb2012-05-15 23:59:42 +080098import warnings
Jeremy Hylton1afc1692008-06-18 20:49:58 +000099
Georg Brandl13e89462008-07-01 19:56:00 +0000100from urllib.error import URLError, HTTPError, ContentTooShortError
101from urllib.parse import (
102 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
103 splittype, splithost, splitport, splituser, splitpasswd,
Senthil Kumarand95cc752010-08-08 11:27:53 +0000104 splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000105from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000106
# Probe for SSL support: assume it is present and clear the flag only if
# the ssl module cannot be imported.
_have_ssl = True
try:
    import ssl
except ImportError:
    _have_ssl = False
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000114
# Used in the User-Agent header sent with each request.  Built from
# sys.version_info rather than by slicing sys.version, because a
# three-character slice truncates two-digit minor versions
# ("3.10.4"[:3] == "3.1").
__version__ = '%d.%d' % sys.version_info[:2]
117
# Module-wide default opener; created lazily by urlopen() or set through
# install_opener().
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None):
    """Open *url* and return whatever the selected opener's open() returns.

    url     -- a URL string or a Request object.
    data    -- optional request body, passed through to the opener.
    timeout -- passed through to the opener.
    cafile, capath -- keyword-only; when either is given, a one-off opener
        is built whose HTTPS handler requires a verified certificate
        (loaded from these locations) and checks the host name.  That
        opener is not stored as the module default.

    Without cafile/capath the module-level opener is used; it is created
    with build_opener() on first use.

    Raises ValueError if cafile/capath is given but the ssl module could
    not be imported.
    """
    global _opener
    if cafile or capath:
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.options |= ssl.OP_NO_SSLv2
        # This branch is only entered when a CA location was supplied, so
        # certificate and host name verification are always switched on.
        context.verify_mode = ssl.CERT_REQUIRED
        context.load_verify_locations(cafile, capath)
        https_handler = HTTPSHandler(context=context, check_hostname=True)
        opener = build_opener(https_handler)
    elif _opener is None:
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000140
def install_opener(opener):
    """Make *opener* the module-level default opener used by urlopen()."""
    global _opener
    _opener = opener
144
# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve *url* through the shared FancyURLopener instance.

    The shared instance is created on first use; all arguments are
    forwarded unchanged to its retrieve() method, whose result is returned.
    """
    global _urlopener
    # Reuse the existing opener when there is one, otherwise create it.
    _urlopener = _urlopener or FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
152
def urlcleanup():
    """Clean up the urlretrieve() opener and forget the default opener.

    Calls cleanup() on the shared urlretrieve() opener if one exists, then
    resets the module-level opener used by urlopen() to None.
    """
    global _opener
    if _urlopener:
        _urlopener.cleanup()
    if _opener:
        _opener = None
159
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host comes from the network-location part of request.full_url;
    when that is empty, the request's "Host" header is used instead
    (defaulting to the empty string).  A trailing ":port" is stripped.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urlparse(request.full_url).netloc
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present
    without_port = _cut_port_re.sub("", netloc, count=1)
    return without_port.lower()
177
class Request:
    """Encapsulate the state of a single URL request.

    Attributes set by the constructor:
      full_url         -- the unwrapped URL with any '#fragment' removed
      fragment         -- the fragment split off full_url (None if absent)
      data             -- request body; a non-None value makes get_method()
                          return "POST"
      headers          -- regular headers, keyed by capitalize()d name
      unredirected_hdrs -- headers that are not copied to a redirected request
      origin_req_host  -- as passed, or request_host(self) when None
      unverifiable     -- as passed
      type, host, selector -- parsed from full_url by _parse()

    Raises ValueError from the constructor when the URL has no scheme.
    """

    def __init__(self, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        self.full_url, self.fragment = splittag(self.full_url)
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        # headers defaults to None rather than a shared mutable dict;
        # None simply means "no extra headers".
        if headers is not None:
            for key, value in headers.items():
                self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self._parse()

    def _parse(self):
        """Split full_url into type, host and selector."""
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return "POST" when the request carries data, otherwise "GET"."""
        if self.data is not None:
            return "POST"
        else:
            return "GET"

    # Begin deprecated methods

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        # Re-attach the fragment that __init__ split off, if there was one.
        if self.fragment:
            return '%s#%s' % (self.full_url, self.fragment)
        else:
            return self.full_url

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Route this request through the proxy at *host*.

        For an https request that has no tunnel host yet, the original
        host is remembered in _tunnel_host and type/selector are left
        alone.  Otherwise the request takes the proxy's *type* and the
        selector becomes the full URL.  In both cases host becomes *host*.
        """
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        """Return True when the selector is the full URL (see set_proxy)."""
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """Return True if *header_name* is set in either header dict.

        The name is looked up as given; it is not capitalize()d here.
        """
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        """Return a header value, preferring regular over unredirected."""
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return all headers as a list of (name, value) pairs.

        Regular headers override unredirected ones of the same name.
        """
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
277
class OpenerDirector:
    """Dispatch requests to registered handlers.

    Handlers are indexed by the names of their methods:
    <protocol>_open, <protocol>_request, <protocol>_response and
    <protocol>_error_<kind>.
    """

    def __init__(self):
        self.addheaders = [('User-agent', "Python-urllib/%s" % __version__)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler* under every protocol method it defines.

        Raises TypeError if the handler has no add_parent attribute.  The
        handler is attached (and add_parent called) only if at least one
        of its method names matched a known pattern.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        # Conditions whose registration key is simply the protocol.
        by_condition = {
            "open": self.handle_open,
            "response": self.process_response,
            "request": self.process_request,
        }
        registered = False
        for name in dir(handler):
            if name in ("redirect_request", "do_open", "proxy_open"):
                # oops, coincidental match
                continue

            split_at = name.find("_")
            protocol = name[:split_at]
            condition = name[split_at+1:]

            if condition.startswith("error"):
                # Everything after "<protocol>_error_" is the error kind;
                # numeric kinds (HTTP status codes) are stored as ints.
                kind_start = condition.find("_") + split_at + 1
                kind = name[kind_start+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                table = self.handle_error.setdefault(protocol, {})
            elif condition in by_condition:
                kind = protocol
                table = by_condition[condition]
            else:
                continue

            bucket = table.setdefault(kind, [])
            if not bucket:
                bucket.append(handler)
            else:
                bisect.insort(bucket, handler)
            registered = True

        if registered:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        for handler in chain.get(kind, ()):
            outcome = getattr(handler, meth_name)(*args)
            if outcome is not None:
                return outcome

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or Request object).

        Runs the <protocol>_request pre-processors, opens the request,
        then runs the <protocol>_response post-processors and returns the
        final response.
        """
        # accept a URL or a Request object
        if not isinstance(fullurl, str):
            req = fullurl
            if data is not None:
                req.data = data
        else:
            req = Request(fullurl, data)

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        request_hook = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            req = getattr(processor, request_hook)(req)

        response = self._open(req, data)

        # post-process response
        response_hook = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            response = getattr(processor, response_hook)(req, response)

        return response

    def _open(self, req, data=None):
        """Try default_open, then <protocol>_open, then unknown_open."""
        protocol = req.type
        attempts = (
            ('default', 'default_open'),
            (protocol, protocol + '_open'),
        )
        for kind, meth_name in attempts:
            result = self._call_chain(self.handle_open, kind, meth_name, req)
            if result:
                return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the matching handler chain.

        For http/https the chain is keyed by args[2] (the status code) and
        falls back to http_error_default when no handler gives a result.
        """
        is_http = proto in ('http', 'https')
        if is_http:
            # XXX http[s] protocols are special-cased
            table = self.handle_error['http'] # https is not different than http
            orig_args = args
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
        else:
            table = self.handle_error
            meth_name = proto + '_error'

        result = self._call_chain(table, proto, meth_name, *args)
        if result:
            return result

        if is_http:
            return self._call_chain(table, 'default', 'http_error_default',
                                    *orig_args)
415
# XXX probably also want an abstract factory that knows when it makes
# sense to skip a superclass in favor of a subclass and when it might
# make sense to include both

def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    Each argument may be a handler instance or a handler class (which is
    instantiated with no arguments).  If any of the handlers passed as
    arguments are subclasses (or instances of subclasses) of a default
    handler, that default handler will not be used.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    def overrides(candidate, default):
        # True when the caller-supplied handler stands in for *default*.
        if isclass(candidate):
            return issubclass(candidate, default)
        return isinstance(candidate, default)

    opener = OpenerDirector()
    defaults = [ProxyHandler, UnknownHandler, HTTPHandler,
                HTTPDefaultErrorHandler, HTTPRedirectHandler,
                FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        defaults.append(HTTPSHandler)

    replaced = {default
                for default in defaults
                for candidate in handlers
                if overrides(candidate, default)}

    # Install the surviving defaults first, then the caller's handlers.
    for default in defaults:
        if default not in replaced:
            opener.add_handler(default())

    for candidate in handlers:
        opener.add_handler(candidate() if isclass(candidate) else candidate)
    return opener
457
class BaseHandler:
    """Base class for handlers; provides parent tracking and ordering."""

    # Position in a handler chain; lower values sort first.
    handler_order = 500

    def add_parent(self, parent):
        """Remember the object this handler was registered with."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        """Order handlers by handler_order."""
        try:
            other_order = other.handler_order
        except AttributeError:
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other_order
475
476
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return 2xx responses unchanged; route the rest to parent.error()."""
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= code < 300:
            return response
        return self.parent.error('http', request, response, code, msg, hdrs)

    https_response = http_response
493
class HTTPDefaultErrorHandler(BaseHandler):
    """Fallback error handler: turn the response into an HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        error = HTTPError(req.full_url, code, msg, hdrs, fp)
        raise error
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000497
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirects, guarding against loops."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        method = req.get_method()
        follows_safe_method = (code in (301, 302, 303, 307)
                               and method in ("GET", "HEAD"))
        follows_post = code in (301, 302, 303) and method == "POST"
        if not (follows_safe_method or follows_post):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # Be lenient with URIs containing a space.
        newurl = newurl.replace(' ', '%20')
        # The new request carries no body, so body-describing headers
        # are not copied over.
        body_headers = ("content-length", "content-type")
        kept_headers = {}
        for name, value in req.headers.items():
            if name.lower() not in body_headers:
                kept_headers[name] = value
        return Request(newurl,
                       headers=kept_headers,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        for field in ("location", "uri"):
            if field in headers:
                newurl = headers[field]
                break
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.
        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        if not urlparts.path:
            urlparts = urlparts._replace(path="/")
        newurl = urljoin(req.full_url, urlunparse(urlparts))

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if not hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict = {}
        else:
            visited = new.redirect_dict = req.redirect_dict
            too_many_repeats = visited.get(newurl, 0) >= self.max_repeats
            if too_many_repeats or len(visited) >= self.max_redirections:
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
600
601
602def _parse_proxy(proxy):
603 """Return (scheme, user, password, host/port) given a URL or an authority.
604
605 If a URL is supplied, it must have an authority (host:port) component.
606 According to RFC 3986, having an authority component means the URL must
607 have two slashes after the scheme:
608
609 >>> _parse_proxy('file:/ftp.example.com/')
610 Traceback (most recent call last):
611 ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
612
613 The first three items of the returned tuple may be None.
614
615 Examples of authority parsing:
616
617 >>> _parse_proxy('proxy.example.com')
618 (None, None, None, 'proxy.example.com')
619 >>> _parse_proxy('proxy.example.com:3128')
620 (None, None, None, 'proxy.example.com:3128')
621
622 The authority component may optionally include userinfo (assumed to be
623 username:password):
624
625 >>> _parse_proxy('joe:password@proxy.example.com')
626 (None, 'joe', 'password', 'proxy.example.com')
627 >>> _parse_proxy('joe:password@proxy.example.com:3128')
628 (None, 'joe', 'password', 'proxy.example.com:3128')
629
630 Same examples, but with URLs instead:
631
632 >>> _parse_proxy('http://proxy.example.com/')
633 ('http', None, None, 'proxy.example.com')
634 >>> _parse_proxy('http://proxy.example.com:3128/')
635 ('http', None, None, 'proxy.example.com:3128')
636 >>> _parse_proxy('http://joe:password@proxy.example.com/')
637 ('http', 'joe', 'password', 'proxy.example.com')
638 >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
639 ('http', 'joe', 'password', 'proxy.example.com:3128')
640
641 Everything after the authority is ignored:
642
643 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
644 ('ftp', 'joe', 'password', 'proxy.example.com')
645
646 Test for no trailing '/' case:
647
648 >>> _parse_proxy('http://joe:password@proxy.example.com')
649 ('http', 'joe', 'password', 'proxy.example.com')
650
651 """
Georg Brandl13e89462008-07-01 19:56:00 +0000652 scheme, r_scheme = splittype(proxy)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000653 if not r_scheme.startswith("/"):
654 # authority
655 scheme = None
656 authority = proxy
657 else:
658 # URL
659 if not r_scheme.startswith("//"):
660 raise ValueError("proxy URL with no authority: %r" % proxy)
661 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
662 # and 3.3.), path is empty or starts with '/'
663 end = r_scheme.find("/", 2)
664 if end == -1:
665 end = None
666 authority = r_scheme[2:end]
Georg Brandl13e89462008-07-01 19:56:00 +0000667 userinfo, hostport = splituser(authority)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000668 if userinfo is not None:
Georg Brandl13e89462008-07-01 19:56:00 +0000669 user, password = splitpasswd(userinfo)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000670 else:
671 user = password = None
672 return scheme, user, password, hostport
673
class ProxyHandler(BaseHandler):
    """Send requests through the proxies given as a scheme -> URL mapping."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        """Install a <scheme>_open method for every entry in *proxies*.

        When *proxies* is None the mapping comes from getproxies().
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, url in proxies.items():
            # Default arguments freeze this iteration's values in the lambda.
            opener = (lambda r, proxy=url, type=scheme, meth=self.proxy_open:
                      meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, opener)

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*; re-open it if the scheme had to change."""
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user), unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        req.set_proxy(unquote(hostport), proxy_type)

        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        # need to start over, because the other handlers don't
        # grok the proxy's URL type
        # e.g. if we have a constructor arg proxies like so:
        # {'http': 'ftp://proxy.example.com'}, we may end up turning
        # a request for http://acme.example.com/a into one for
        # ftp://proxy.example.com/a
        return self.parent.open(req, timeout=req.timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000715
class HTTPPasswordMgr:
    """Store (user, password) pairs keyed by realm and URI prefix.

    self.passwd maps realm -> {tuple of reduced URIs -> (user, passwd)},
    where a reduced URI is the (authority, path) pair from reduce_uri().
    """

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* and one URI or a sequence of URIs.

        Each URI is stored twice: once with the scheme's default port made
        explicit and once as given, so lookups match either spelling.
        """
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) stored for *authuri* in *realm*.

        Returns (None, None) when no stored URI equals or is a parent of
        *authuri*.
        """
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare whole path segments.  A character-wise common prefix
        # would treat "/foobar" as lying below "/foo" and so offer the
        # credentials to an unrelated sibling path.
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
778
779
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the realm None."""

    def find_user_password(self, realm, authuri):
        """Look up *realm* first; if no user is found, look up realm None."""
        lookup = HTTPPasswordMgr.find_user_password
        user, password = lookup(self, realm, authuri)
        if user is None:
            return lookup(self, None, authuri)
        return user, password
788
789
class AbstractBasicAuthHandler:
    """Shared logic for answering HTTP Basic authentication challenges.

    Subclasses are expected to provide ``auth_header`` (the request header
    that carries the credentials) and a ``parent`` opener; both are read by
    retry_http_basic_auth().
    """

    # Matches one "<scheme> realm=<value>" challenge.  The match is
    # anchored at the start of the header or at a comma; the earlier
    # "(?:.*,)*" prefix could backtrack catastrophically on headers with
    # many commas, and the header is supplied by the remote server.
    #
    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:^|,)'      # start of the string or ','
                    '[ \t]*'       # optional whitespace
                    '([^ \t,]+)'   # scheme like "Basic"
                    '[ \t]+'       # mandatory whitespace
                    'realm=(["\']?)([^"\']*)\\2',
                    re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password
        self.retried = 0

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Answer the challenge found in the *authreq* header, if possible.

        authreq -- name of the challenge header to read from *headers*
        host    -- an authority (without userinfo) or a URL with an
                   authority; used to look up credentials
        req     -- the request that was rejected
        headers -- the response headers

        Returns the response of the retried request for the first Basic
        challenge in the header, or None if there is no such challenge or
        no retry was made.  Raises HTTPError once the retry counter has
        exceeded 5.
        """
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if self.retried > 5:
            # retry sending the username:password 5 times before failing.
            raise HTTPError(req.get_full_url(), 401, "basic auth failed",
                            headers, None)
        else:
            self.retried += 1

        if authreq:
            # A header may carry several challenges; use the first Basic one.
            for mo in AbstractBasicAuthHandler.rx.finditer(authreq):
                scheme, quote_char, realm = mo.groups()
                if quote_char not in ["'", '"']:
                    warnings.warn("Basic Auth Realm was unquoted",
                                  UserWarning, 2)
                if scheme.lower() == 'basic':
                    response = self.retry_http_basic_auth(host, req, realm)
                    if response and response.code != 401:
                        self.retried = 0
                    return response

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send *req* with Basic credentials for *realm*.

        Returns None when no password is known, or when the request already
        carries exactly these credentials in its regular headers.
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.headers.get(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None
851
852
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses using the Authorization request header."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry *req* with Basic credentials looked up for its full URL."""
        retried_response = self.http_error_auth_reqed(
            'www-authenticate', req.full_url, req, headers)
        # Start from a clean attempt counter for the next request.
        self.reset_retry_count()
        return retried_response
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000863
864
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses using the Proxy-authorization request header."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Retry *req* with Basic credentials looked up for ``req.host``."""
        # http_error_auth_reqed requires that there is no userinfo component
        # in the authority.  Assume there isn't one, since urllib.request
        # does not (and should not, RFC 3986 s. 3.2.1) support requests for
        # URLs containing userinfo.
        retried_response = self.http_error_auth_reqed(
            'proxy-authenticate', req.host, req, headers)
        self.reset_retry_count()
        return retried_response
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000879
880
def randombytes(n):
    """Return a bytes object holding *n* bytes obtained from os.urandom()."""
    chunk = os.urandom(n)
    return chunk
884
class AbstractDigestAuthHandler:
    """Shared implementation of HTTP Digest authentication retries.

    Subclasses are expected to define ``auth_header`` and to provide
    ``parent``, whose ``open`` method is used to re-issue the request.
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        """Store *passwd* (a new HTTPPasswordMgr when None) and reset state."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        """Reset the counter of consecutive authentication attempts."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Answer a Digest challenge carried in header *auth_header*.

        Returns the response of the retried request, or None when the
        header is absent or does not start with the Digest scheme.  Raises
        HTTPError once more than five attempts have been made.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Re-issue *req* with a Digest header computed from challenge *auth*.

        Returns None when no authorization value can be computed or when
        the same value is already present on the request.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # Already tried exactly this value; avoid looping.
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        """Return a 16-character hexadecimal client nonce."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest credentials string for *req* from challenge *chal*.

        *chal* is a mapping of challenge parameters.  Returns None when the
        challenge lacks ``realm`` or ``nonce``, names an unsupported
        algorithm, or no user is known for the realm.  Raises URLError when
        the challenge offers a ``qop`` list that does not include ``auth``.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in [option.strip() for option in qop.split(',')]:
            # The server may offer a comma-separated list of qop options
            # (e.g. "auth,auth-int"); "auth" is the one implemented here.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth',
                                           H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the ``(H, KD)`` hashing callables for *algorithm*.

        ``(None, None)`` is returned for an algorithm other than 'MD5' or
        'SHA', which get_authorization() treats as "cannot authenticate".
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # XXX MD5-sess
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        """Return the entity digest for *data*; always None for now."""
        # XXX not implemented yet
        return None
1021
1022
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Handler for the digest authentication protocol of RFC 2069.

    Compared with basic authentication, digest authentication avoids
    sending the password itself in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry *req* with Digest credentials after a 401 response."""
        # Item 1 of the parsed URL is handed on as the host argument.
        netloc = urlparse(req.full_url)[1]
        retried_response = self.http_error_auth_reqed(
            'www-authenticate', netloc, req, headers)
        self.reset_retry_count()
        return retried_response
1039
1040
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 responses using the Proxy-Authorization request header."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        """Retry *req* with Digest credentials after a 407 response."""
        retried_response = self.http_error_auth_reqed(
            'proxy-authenticate', req.host, req, headers)
        self.reset_retry_count()
        return retried_response
1052
class AbstractHTTPHandler(BaseHandler):
    """Common request preparation and sending logic for HTTP(S) handlers."""

    def __init__(self, debuglevel=0):
        """Remember *debuglevel* for later use."""
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Replace the stored debug level with *level*."""
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in default headers on *request* and return it.

        Adds Content-type and Content-length for requests that carry data,
        a Host header, and the parent opener's ``addheaders`` -- each only
        when the request does not already have that header.

        Raises URLError when the request has no host, TypeError when the
        data is a str, and ValueError when the data is an iterable without
        a buffer interface and no Content-length header was supplied.
        """
        # Imported locally: the Iterable ABC lives in collections.abc and is
        # no longer reachable as collections.Iterable on current Pythons.
        import collections.abc

        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                msg = "POST data should be bytes or an iterable of bytes. "\
                      "It cannot be of type str."
                raise TypeError(msg)
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                try:
                    mv = memoryview(data)
                except TypeError:
                    # No buffer interface, so the length cannot be computed
                    # here; an iterable body needs an explicit header.
                    if isinstance(data, collections.abc.Iterable):
                        raise ValueError("Content-Length should be specified "
                                "for iterable data of type %r %r" % (type(data),
                                data))
                else:
                    request.add_unredirected_header(
                        'Content-length', '%d' % (len(mv) * mv.itemsize))

        sel_host = host
        if request.has_proxy():
            # Behind a proxy the selector is a full URL; take the Host
            # header value from it rather than from the proxy's address.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        Extra keyword arguments are passed to the http_class constructor.

        Raises URLError when the request has no host or when sending it
        fails with a socket error.  The connection is closed whenever
        sending the request or reading the response raises.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)

        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers)
            except socket.error as err:  # timeout error
                raise URLError(err)
            r = h.getresponse()
        except BaseException:
            # Release the connection on any failure, including one raised
            # while reading the response, then let the error propagate.
            h.close()
            raise

        r.url = req.get_full_url()
        # urllib clients expect the response to have the reason in .msg,
        # so the .msg attribute of the HTTPResponse is overwritten here.
        # It would be good to mark this attribute as deprecated and get
        # them to use info() or .headers.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001155
1156
class HTTPHandler(AbstractHTTPHandler):
    """Handler that opens http URLs with http.client.HTTPConnection."""

    # Request pre-processing is inherited unchanged.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Send *req* over a plain HTTP connection and return the response."""
        connection_class = http.client.HTTPConnection
        return self.do_open(connection_class, req)
1163
if hasattr(http.client, 'HTTPSConnection'):
    import ssl

    class HTTPSHandler(AbstractHTTPHandler):
        """Handler that opens https URLs with http.client.HTTPSConnection.

        Only defined when http.client provides HTTPSConnection.
        """

        # Request pre-processing is inherited unchanged.
        https_request = AbstractHTTPHandler.do_request_

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            """Remember the connection options used by https_open()."""
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._check_hostname = check_hostname
            self._context = context

        def https_open(self, req):
            """Send *req* over an HTTPS connection and return the response."""
            connection_options = dict(context=self._context,
                                      check_hostname=self._check_hostname)
            return self.do_open(http.client.HTTPSConnection, req,
                                **connection_options)
1179
class HTTPCookieProcessor(BaseHandler):
    """Handler that passes requests and responses through a cookie jar."""

    def __init__(self, cookiejar=None):
        """Use *cookiejar*, or a new http.cookiejar.CookieJar when None."""
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        """Let the jar add its cookie header to *request*; return it."""
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Let the jar extract cookies from *response*; return it."""
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    # https traffic is processed exactly like http traffic.
    https_request = http_request
    https_response = http_response
1197
class UnknownHandler(BaseHandler):
    """Handler whose unknown_open() always fails with URLError."""

    def unknown_open(self, req):
        """Raise URLError naming the type of *req*."""
        url_type = req.type
        raise URLError('unknown url type: %s' % url_type)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001202
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dict mapping each key to its value; one pair of enclosing
    double quotes is removed from a value.  An element with an empty value
    ("key=") maps the key to the empty string.  An element without '='
    raises ValueError.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices instead of indexes so an empty value does not raise
        # IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1212
def parse_http_list(s):
    """Split a comma-separated list as described by RFC 2068 Section 2.

    Elements may contain double-quoted strings, and a comma inside such a
    quoted string does not split the element.  Inside quotes a backslash
    escapes the next character (the backslash itself is dropped).  Single
    quotes have no special meaning.  Each element is returned stripped of
    surrounding whitespace; a trailing empty element is not reported.
    """
    items = []
    buf = []            # characters of the element being collected
    in_quotes = False
    escaped = False

    for ch in s:
        if escaped:
            # Character following a backslash: taken literally.
            buf.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                buf.append(ch)
        elif ch == ',':
            items.append(''.join(buf))
            buf = []
        else:
            if ch == '"':
                in_quotes = True
            buf.append(ch)

    # append last part
    if buf:
        items.append(''.join(buf))

    return [item.strip() for item in items]
1255
class FileHandler(BaseHandler):
    """Handler that opens file URLs referring to the local machine."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open the local file named by *req*.

        Raises URLError when the selector names a host that is neither
        'localhost' nor one of the entries returned by get_names().
        """
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            # get_names() returns a tuple, so this must be a membership
            # test; an identity comparison against the tuple never holds.
            if req.host not in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the cached tuple of addresses considered local.

        The tuple is computed on first use and stored on the class.
        """
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the local file named by ``req.selector``.

        The headers carry Content-type, Content-length and Last-modified
        derived from the file.  Raises URLError when the file cannot be
        stat'ed or opened, or when the request's host is not local.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001307
1308def _safe_gethostbyname(host):
1309 try:
1310 return socket.gethostbyname(host)
1311 except socket.gaierror:
1312 return None
1313
class FTPHandler(BaseHandler):
    """Handler that opens ftp URLs through an ftpwrapper connection."""

    def ftp_open(self, req):
        """Retrieve the file or directory named by *req*.

        Returns an addinfourl whose headers carry Content-type and
        Content-length when they can be determined.  Raises URLError when
        the host is missing, cannot be resolved, or an ftplib error occurs.
        """
        import ftplib
        import mimetypes

        netloc = req.host
        if not netloc:
            raise URLError('ftp error: no host given')
        netloc, port = splitport(netloc)
        port = ftplib.FTP_PORT if port is None else int(port)

        # username/password handling
        user, netloc = splituser(netloc)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        netloc = unquote(netloc)
        user = user or ''
        passwd = passwd or ''

        try:
            address = socket.gethostbyname(netloc)
        except socket.error as msg:
            raise URLError(msg)

        path, attrs = splitattr(req.selector)
        segments = [unquote(segment) for segment in path.split('/')]
        dirs, file = segments[:-1], segments[-1]
        if dirs and not dirs[0]:
            # Drop the empty segment produced by a leading '/'.
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, address, port, dirs,
                                  req.timeout)
            # 'I' when a file name is present, 'D' otherwise; a ";type="
            # attribute in the URL overrides the choice.
            transfer_type = 'I' if file else 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if (attr.lower() == 'type' and
                        value in ('a', 'A', 'i', 'I', 'd', 'D')):
                    transfer_type = value.upper()
            fp, retrlen = fw.retrfile(file, transfer_type)
            header_lines = []
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                header_lines.append("Content-type: %s\n" % mtype)
            if retrlen is not None and retrlen >= 0:
                header_lines.append("Content-length: %d\n" % retrlen)
            headers = email.message_from_string("".join(header_lines))
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a new, non-persistent ftpwrapper for the given target."""
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001371
class CacheFTPHandler(FTPHandler):
    """FTP handler that keeps ftpwrapper connections in a cache.

    ``cache`` maps a connection key to its ftpwrapper and ``timeout`` maps
    the same key to the time after which the entry is considered expired.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        """Start with an empty cache, a delay of 60 and at most 16 entries."""
        self.cache = {}
        self.timeout = {}
        self.soonest = 0
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        """Set the delay added to the current time when an entry is used."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which the oldest entry is dropped."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return the cached ftpwrapper for the target, creating it if needed."""
        key = user, host, port, '/'.join(dirs), timeout
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
        # Used (or created) just now: push its expiry time forward.
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Close expired entries, then drop one entry if the cache is full."""
        # first check for old ones
        now = time.time()
        if self.soonest <= now:
            for key, expiry in list(self.timeout.items()):
                if expiry < now:
                    self.cache[key].close()
                    del self.cache[key]
                    del self.timeout[key]
        self.soonest = min(self.timeout.values())

        # then check the size
        if len(self.cache) == self.max_conns:
            for key, expiry in list(self.timeout.items()):
                if expiry == self.soonest:
                    del self.cache[key]
                    del self.timeout[key]
                    break
            self.soonest = min(self.timeout.values())

    def clear_cache(self):
        """Close every cached connection and empty both tables."""
        for connection in self.cache.values():
            connection.close()
        self.cache.clear()
        self.timeout.clear()
1424
1425
# Code moved from the old urllib module
1427
MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name != 'nt':
    def url2pathname(pathname):
        """Convert a relative 'file' scheme URL to a file system path.

        OS-specific; not recommended for general use.
        """
        return unquote(pathname)

    def pathname2url(pathname):
        """Convert a file system path to a relative 'file' scheme URL.

        OS-specific; not recommended for general use.
        """
        return quote(pathname)
else:
    from nturl2path import url2pathname, pathname2url
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001443
1444# This really consists of two pieces:
1445# (1) a class which handles opening of all sorts of URLs
1446# (plus assorted utilities etc.)
1447# (2) a set of functions for parsing URLs
1448# XXX Should these be separated out into different modules?
1449
1450
1451ftpcache = {}
1452class URLopener:
1453 """Class to open URLs.
1454 This is a class rather than just a subroutine because we may need
1455 more than one set of global protocol-specific options.
1456 Note -- this is a base class for those who don't want the
1457 automatic handling of errors type 302 (relocated) and 401
1458 (authorization needed)."""
1459
1460 __tempfiles = None
1461
1462 version = "Python-urllib/%s" % __version__
1463
1464 # Constructor
def __init__(self, proxies=None, **x509):
    """Set up the opener.

    *proxies* must be a mapping; when None, the result of getproxies() is
    used.  The optional ``key_file`` and ``cert_file`` keyword arguments
    are stored on the instance.
    """
    if proxies is None:
        proxies = getproxies()
    assert hasattr(proxies, 'keys'), "proxies must be a mapping"
    self.proxies = proxies
    self.cert_file = x509.get('cert_file')
    self.key_file = x509.get('key_file')
    self.addheaders = [('User-Agent', self.version)]
    self.__unlink = os.unlink  # See cleanup()
    self.__tempfiles = []
    # Undocumented feature: if you assign {} to tempcache, it is used to
    # cache files retrieved with self.retrieve().  This is not enabled by
    # default since it does not work for changing documents (and I haven't
    # got the logic to check expiration headers yet).
    self.tempcache = None
    # Undocumented feature: you can use a different ftp cache by assigning
    # to the .ftpcache member; in case you want logically independent URL
    # openers.
    # XXX This is not threadsafe.  Bah.
    self.ftpcache = ftpcache
1487
1488 def __del__(self):
1489 self.close()
1490
def close(self):
    """Close the opener by running cleanup()."""
    self.cleanup()
1493
def cleanup(self):
    """Remove the recorded temporary files and empty the temp cache.

    Files that cannot be removed (OSError) are ignored.
    """
    # This code sometimes runs when the rest of this module
    # has already been deleted, so it can't use any globals
    # or import anything.
    pending = self.__tempfiles
    if pending:
        for path in pending:
            try:
                self.__unlink(path)
            except OSError:
                pass
        # Empty the list in place so the attribute keeps the same object.
        del pending[:]
    if self.tempcache:
        self.tempcache.clear()
1507
def addheader(self, *args):
    """Append the positional arguments, as one tuple, to self.addheaders.

    The header is used by the HTTP interface only,
    e.g. u.addheader('Accept', 'sound/basic')
    """
    header = args
    self.addheaders.append(header)
1512
1513 # External interface
def open(self, fullurl, data=None):
    """Use URLopener().open(file) instead of open(file, 'r').

    Dispatches to the ``open_<type>`` method matching the URL type (with
    '-' replaced by '_'), going through a configured proxy when there is
    one for that type.  Socket errors raised by the method are re-raised
    as IOError('socket error', ...).
    """
    fullurl = quote(unwrap(to_bytes(fullurl)), safe="%/:=&?~#+!$,;'@()*[]|")
    if self.tempcache and fullurl in self.tempcache:
        # Served from the temp cache: reopen the stored file.
        cached_name, cached_headers = self.tempcache[fullurl]
        return addinfourl(open(cached_name, 'rb'), cached_headers, fullurl)

    urltype, url = splittype(fullurl)
    if not urltype:
        urltype = 'file'
    proxy = None
    if urltype in self.proxies:
        proxy = self.proxies[urltype]
        urltype, proxyhost = splittype(proxy)
        host, selector = splithost(proxyhost)
        url = (host, fullurl)  # Signal special case to open_*()

    self.type = urltype
    name = ('open_' + urltype).replace('-', '_')
    if not hasattr(self, name):
        if proxy:
            return self.open_unknown_proxy(proxy, fullurl, data)
        return self.open_unknown(fullurl, data)

    try:
        if data is None:
            return getattr(self, name)(url)
        return getattr(self, name)(url, data)
    except socket.error as msg:
        raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1547
def open_unknown(self, fullurl, data=None):
    """Overridable interface to open unknown URL type.

    Always raises IOError naming the URL type.
    """
    url_type = splittype(fullurl)[0]
    raise IOError('url error', 'unknown url type', url_type)
1552
def open_unknown_proxy(self, proxy, fullurl, data=None):
    """Overridable interface to open a URL through an unusable proxy.

    Always raises IOError naming the URL type and the proxy.
    """
    url_type = splittype(fullurl)[0]
    raise IOError('url error', 'invalid proxy for %s' % url_type, proxy)
1557
1558 # External interface
def retrieve(self, url, filename=None, reporthook=None, data=None):
    """retrieve(url) returns (filename, headers) for a local object
    or (tempfilename, headers) for a remote object.

    When *reporthook* is given it is called with (block number, block
    size, total size) before the first block and after each block read;
    the total size is -1 when no content-length header is present.
    Raises ContentTooShortError when fewer bytes than announced arrive.
    """
    url = unwrap(to_bytes(url))
    if self.tempcache and url in self.tempcache:
        return self.tempcache[url]
    scheme, rest = splittype(url)
    if filename is None and (not scheme or scheme == 'file'):
        # A local file needs no copy: report its own path.
        try:
            local = self.open_local_file(rest)
            local_headers = local.info()
            local.close()
            return url2pathname(splithost(rest)[1]), local_headers
        except IOError:
            pass
    fp = self.open(url, data)
    try:
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            # No target given: download into a temporary file whose suffix
            # is taken from the URL path.
            import tempfile
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            fd, filename = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        try:
            result = filename, headers
            if self.tempcache is not None:
                self.tempcache[url] = result
            blocksize = 1024 * 8
            size = -1
            received = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, blocksize, size)
            while True:
                block = fp.read(blocksize)
                if not block:
                    break
                received += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, blocksize, size)
        finally:
            tfp.close()
    finally:
        fp.close()

    # raise exception if actual size does not match content-length header
    if size >= 0 and received < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (received, size), result)

    return result
1622
1623 # Each method named open_<type> knows how to open that type of URL
1624
1625 def _open_generic_http(self, connection_factory, url, data):
1626 """Make an HTTP connection using connection_class.
1627
1628 This is an internal method that should be called from
1629 open_http() or open_https().
1630
1631 Arguments:
1632 - connection_factory should take a host name and return an
1633 HTTPConnection instance.
1634 - url is the url to retrieval or a host, relative-path pair.
1635 - data is payload for a POST request or None.
1636 """
1637
1638 user_passwd = None
1639 proxy_passwd= None
1640 if isinstance(url, str):
Georg Brandl13e89462008-07-01 19:56:00 +00001641 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001642 if host:
Georg Brandl13e89462008-07-01 19:56:00 +00001643 user_passwd, host = splituser(host)
1644 host = unquote(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001645 realhost = host
1646 else:
1647 host, selector = url
1648 # check whether the proxy contains authorization information
Georg Brandl13e89462008-07-01 19:56:00 +00001649 proxy_passwd, host = splituser(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001650 # now we proceed with the url we want to obtain
Georg Brandl13e89462008-07-01 19:56:00 +00001651 urltype, rest = splittype(selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001652 url = rest
1653 user_passwd = None
1654 if urltype.lower() != 'http':
1655 realhost = None
1656 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001657 realhost, rest = splithost(rest)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001658 if realhost:
Georg Brandl13e89462008-07-01 19:56:00 +00001659 user_passwd, realhost = splituser(realhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001660 if user_passwd:
1661 selector = "%s://%s%s" % (urltype, realhost, rest)
1662 if proxy_bypass(realhost):
1663 host = realhost
1664
1665 #print "proxy via http:", host, selector
1666 if not host: raise IOError('http error', 'no host given')
1667
1668 if proxy_passwd:
Senthil Kumaranc5c5a142012-01-14 19:09:04 +08001669 proxy_passwd = unquote(proxy_passwd)
Senthil Kumaran5626eec2010-08-04 17:46:23 +00001670 proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001671 else:
1672 proxy_auth = None
1673
1674 if user_passwd:
Senthil Kumaranc5c5a142012-01-14 19:09:04 +08001675 user_passwd = unquote(user_passwd)
Senthil Kumaran5626eec2010-08-04 17:46:23 +00001676 auth = base64.b64encode(user_passwd.encode()).decode('ascii')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001677 else:
1678 auth = None
1679 http_conn = connection_factory(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001680 headers = {}
1681 if proxy_auth:
1682 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
1683 if auth:
1684 headers["Authorization"] = "Basic %s" % auth
1685 if realhost:
1686 headers["Host"] = realhost
Senthil Kumarand91ffca2011-03-19 17:25:27 +08001687
1688 # Add Connection:close as we don't support persistent connections yet.
1689 # This helps in closing the socket and avoiding ResourceWarning
1690
1691 headers["Connection"] = "close"
1692
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001693 for header, value in self.addheaders:
1694 headers[header] = value
1695
1696 if data is not None:
1697 headers["Content-Type"] = "application/x-www-form-urlencoded"
1698 http_conn.request("POST", selector, data, headers)
1699 else:
1700 http_conn.request("GET", selector, headers=headers)
1701
1702 try:
1703 response = http_conn.getresponse()
1704 except http.client.BadStatusLine:
1705 # something went wrong with the HTTP status line
Georg Brandl13e89462008-07-01 19:56:00 +00001706 raise URLError("http protocol error: bad status line")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001707
1708 # According to RFC 2616, "2xx" code indicates that the client's
1709 # request was successfully received, understood, and accepted.
1710 if 200 <= response.status < 300:
Antoine Pitroub353c122009-02-11 00:39:14 +00001711 return addinfourl(response, response.msg, "http:" + url,
Georg Brandl13e89462008-07-01 19:56:00 +00001712 response.status)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001713 else:
1714 return self.http_error(
1715 url, response.fp,
1716 response.status, response.reason, response.msg, data)
1717
def open_http(self, url, data=None):
    """Open *url* over plain HTTP; POST *data* when it is not None."""
    factory = http.client.HTTPConnection
    return self._open_generic_http(factory, url, data)
1721
def http_error(self, url, fp, errcode, errmsg, headers, data=None):
    """Dispatch a non-2xx response to the matching handler.

    If the opener has a method named http_error_DDD (DDD being the
    numeric status code) it is called first, with *data* appended only
    when it is not None.  A truthy result from that method is returned
    as-is; otherwise http_error_default() decides.  Derived classes can
    override this method or supply the per-code handlers.
    """
    missing = object()
    handler = getattr(self, 'http_error_%d' % errcode, missing)
    if handler is not missing:
        args = (url, fp, errcode, errmsg, headers)
        if data is not None:
            args += (data,)
        result = handler(*args)
        if result:
            return result
    return self.http_error_default(url, fp, errcode, errmsg, headers)
1737
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Last-resort error handler: close *fp* and raise HTTPError.

    The raised HTTPError carries *url*, *errcode*, *errmsg* and
    *headers*, and is created without a file object.
    """
    fp.close()
    error = HTTPError(url, errcode, errmsg, headers, None)
    raise error
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001742
if _have_ssl:
    def _https_connection(self, host):
        """Return an HTTPSConnection to *host* using the opener's
        key_file and cert_file attributes."""
        tls_files = {'key_file': self.key_file,
                     'cert_file': self.cert_file}
        return http.client.HTTPSConnection(host, **tls_files)

    def open_https(self, url, data=None):
        """Open *url* over HTTPS; POST *data* when it is not None."""
        factory = self._https_connection
        return self._open_generic_http(factory, url, data)
1752
def open_file(self, url):
    """Open a file: URL, which must refer to the local machine.

    Raises URLError when *url* is not a string (the proxied form),
    and ValueError when the URL names a host other than "localhost".
    Everything else is delegated to open_local_file().
    """
    if not isinstance(url, str):
        raise URLError('file error', 'proxy support for file protocol currently not implemented')
    # "//host/..." with a non-empty host other than localhost is remote;
    # "///path" (empty host) and plain paths are local.
    names_remote_host = (url[:2] == '//'
                         and url[2:3] != '/'
                         and url[2:12].lower() != 'localhost/')
    if names_remote_host:
        raise ValueError("file:// scheme is supported only on localhost")
    return self.open_local_file(url)
1761
def open_local_file(self, url):
    """Open the local file named by *url* and wrap it as a response.

    *url* is the part of a file: URL after the scheme.  The returned
    addinfourl wraps the file opened in binary mode and carries
    synthesized Content-Type, Content-Length and Last-modified headers.

    Raises URLError when the file cannot be stat()ed or when the URL
    names a host that does not resolve to this machine, and ValueError
    for a host-qualified URL whose path starts with "./".
    """
    import email.utils
    import mimetypes
    host, file = splithost(url)
    localname = url2pathname(file)
    try:
        stats = os.stat(localname)
    except OSError as e:
        # URLError only accepts (reason, filename); passing errno as a
        # third value made this line fail with TypeError instead.
        raise URLError(e.strerror, e.filename)
    size = stats.st_size
    modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
    mtype = mimetypes.guess_type(url)[0]
    headers = email.message_from_string(
        'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
        (mtype or 'text/plain', size, modified))
    if not host:
        urlfile = file
        if file[:1] == '/':
            urlfile = 'file://' + file
        return addinfourl(open(localname, 'rb'), headers, urlfile)
    host, port = splitport(host)
    # localhost() returns a single address string while thishost()
    # returns a tuple, so wrap the former before concatenating.
    if (not port
            and socket.gethostbyname(host) in ((localhost(),) + thishost())):
        urlfile = file
        if file[:1] == '/':
            urlfile = 'file://' + file
        elif file[:2] == './':
            raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
        return addinfourl(open(localname, 'rb'), headers, urlfile)
    raise URLError('local file error', 'not on local host')
1793
def open_ftp(self, url):
    """Open an ftp: URL and return the remote file or listing.

    *url* is the part of the URL after the scheme.  Connections are
    kept in self.ftpcache keyed by (user, host, port, directory) and
    reused; once the cache holds more than MAXFTPCACHE entries every
    entry other than the one needed now is closed and dropped.

    A ";type=a|i|d" attribute selects the transfer type; without it a
    URL ending in "/" is retrieved as a directory listing and anything
    else as a binary file.

    Raises URLError when *url* is not a string (the proxied form), when
    no host is given, or when the FTP layer reports an error.
    """
    if not isinstance(url, str):
        raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
    import ftplib
    import mimetypes
    host, path = splithost(url)
    if not host:
        raise URLError('ftp error', 'no host given')
    host, port = splitport(host)
    user, host = splituser(host)
    if user:
        user, passwd = splitpasswd(user)
    else:
        passwd = None
    host = unquote(host)
    user = unquote(user or '')
    passwd = unquote(passwd or '')
    host = socket.gethostbyname(host)
    if not port:
        port = ftplib.FTP_PORT
    else:
        port = int(port)
    path, attrs = splitattr(path)
    path = unquote(path)
    dirs = path.split('/')
    dirs, file = dirs[:-1], dirs[-1]
    # Drop the empty component produced by the leading "/"; a second
    # empty component (URL path "//...") means the absolute root.
    if dirs and not dirs[0]:
        dirs = dirs[1:]
    if dirs and not dirs[0]:
        dirs[0] = '/'
    key = user, host, port, '/'.join(dirs)
    # XXX thread unsafe!
    if len(self.ftpcache) > MAXFTPCACHE:
        # Prune the cache, rather arbitrarily.  Iterate over a snapshot
        # of the keys: deleting while iterating the live dict raises
        # RuntimeError.
        for k in list(self.ftpcache):
            if k != key:
                stale = self.ftpcache.pop(k)
                stale.close()
    try:
        if key not in self.ftpcache:
            self.ftpcache[key] = \
                ftpwrapper(user, passwd, host, port, dirs)
        if not file:
            type = 'D'
        else:
            type = 'I'
        for attr in attrs:
            attr, value = splitvalue(attr)
            if attr.lower() == 'type' and \
               value in ('a', 'A', 'i', 'I', 'd', 'D'):
                type = value.upper()
        (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
        mtype = mimetypes.guess_type("ftp:" + url)[0]
        headers = ""
        if mtype:
            headers += "Content-Type: %s\n" % mtype
        if retrlen is not None and retrlen >= 0:
            headers += "Content-Length: %d\n" % retrlen
        headers = email.message_from_string(headers)
        return addinfourl(fp, headers, "ftp:" + url)
    except ftperrors() as msg:
        raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1852
def open_data(self, url, data=None):
    """Open a data: URL and return its content as a response object.

    *url* is the part of the URL after the scheme.  Any POSTed *data*
    is ignored.  Raises URLError when *url* is not a string (the proxied
    form) and IOError when it contains no comma.

    Syntax of data URLs:
        dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        mediatype := [ type "/" subtype ] *( ";" parameter )
        data      := *urlchar
        parameter := attribute "=" value
    """
    if not isinstance(url, str):
        raise URLError('data error', 'proxy support for data protocol currently not implemented')
    mediatype, comma, payload = url.partition(',')
    if not comma:
        raise IOError('data error', 'bad data URL')
    if not mediatype:
        mediatype = 'text/plain;charset=US-ASCII'
    # A trailing ";token" without "=" is the transfer encoding rather
    # than a media-type parameter.
    semi = mediatype.rfind(';')
    if semi >= 0 and '=' not in mediatype[semi:]:
        encoding = mediatype[semi+1:]
        mediatype = mediatype[:semi]
    else:
        encoding = ''
    stamp = time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                          time.gmtime(time.time()))
    if encoding == 'base64':
        # XXX is this encoding/decoding ok?
        payload = base64.decodebytes(payload.encode('ascii')).decode('latin1')
    else:
        payload = unquote(payload)
    msg = '\n'.join([
        'Date: %s' % stamp,
        'Content-type: %s' % mediatype,
        'Content-Length: %d' % len(payload),
        '',
        payload,
    ])
    headers = email.message_from_string(msg)
    f = io.StringIO(msg)
    return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001893
1894
class FancyURLopener(URLopener):
    """URLopener that handles redirects and Basic-auth challenges.

    Adds handlers for the 301, 302, 303 and 307 redirect codes and for
    the 401 and 407 authentication codes, and turns any other HTTP
    error into a normal response object instead of an exception.
    """

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # "realm@host" -> (user, password) pairs already obtained.
        self.auth_cache = {}
        # Redirect counter and the limit at which a redirect chain is
        # reported as a 500 error.
        self.maxtries = 10
        self.tries = 0

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        full_url = "http:" + url
        return addinfourl(fp, headers, full_url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        limit_reached = self.maxtries and self.tries >= self.maxtries
        if not limit_reached:
            result = self.redirect_internal(url, fp, errcode, errmsg,
                                            headers, data)
            self.tries = 0
            return result
        # Too many consecutive redirects: report it as a server error.
        meth = getattr(self, "http_error_500", self.http_error_default)
        self.tries = 0
        return meth(url, fp, 500,
                    "Internal Server Error: Redirect Recursion", headers)

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Follow the redirect described by *headers*.

        Returns None when neither a Location nor a URI header is
        present; raises HTTPError when the target scheme is not allowed.
        """
        for field in ('location', 'uri'):
            if field in headers:
                newurl = headers[field]
                break
        else:
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        # For security reasons, we don't allow redirection to anything
        # other than http, https and ftp.
        #
        # We are using newer HTTPError with older redirect_internal
        # method.  This older method will get deprecated in 3.3.
        target_scheme = urlparse(newurl).scheme
        if target_scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is not None:
            return self.http_error_default(url, fp, errcode, errmsg, headers)
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
                       retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        def give_up():
            # Delegates to the base-class default handler.
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)

        if 'www-authenticate' not in headers:
            give_up()
        challenge = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', challenge)
        if not match:
            give_up()
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            give_up()
        if not retry:
            give_up()
        retry_method = getattr(self, 'retry_' + self.type + '_basic_auth')
        if data is None:
            return retry_method(url, realm)
        return retry_method(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
                       retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        def give_up():
            # Delegates to the base-class default handler.
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)

        if 'proxy-authenticate' not in headers:
            give_up()
        challenge = headers['proxy-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', challenge)
        if not match:
            give_up()
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            give_up()
        if not retry:
            give_up()
        retry_method = getattr(self,
                               'retry_proxy_' + self.type + '_basic_auth')
        if data is None:
            return retry_method(url, realm)
        return retry_method(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Re-issue an http request with credentials added to the proxy.

        Returns None when no user name or password is obtained.
        """
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxyhost = splittype(self.proxies['http'])[1]
        proxyhost, proxyselector = splithost(proxyhost)
        # Non-zero when the proxy URL already carried credentials; it is
        # passed on as the clear_cache flag of get_user_passwd().
        at = proxyhost.find('@') + 1
        proxyhost = proxyhost[at:]
        user, passwd = self.get_user_passwd(proxyhost, realm, at)
        if not (user or passwd):
            return None
        credentials = "%s:%s@%s" % (quote(user, safe=''),
                                    quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + credentials + proxyselector
        if data is None:
            return self.open(newurl)
        return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Re-issue an https request with credentials added to the proxy.

        Returns None when no user name or password is obtained.
        """
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxyhost = splittype(self.proxies['https'])[1]
        proxyhost, proxyselector = splithost(proxyhost)
        # Non-zero when the proxy URL already carried credentials; it is
        # passed on as the clear_cache flag of get_user_passwd().
        at = proxyhost.find('@') + 1
        proxyhost = proxyhost[at:]
        user, passwd = self.get_user_passwd(proxyhost, realm, at)
        if not (user or passwd):
            return None
        credentials = "%s:%s@%s" % (quote(user, safe=''),
                                    quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + credentials + proxyselector
        if data is None:
            return self.open(newurl)
        return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Re-issue an http request with credentials embedded in the URL.

        Returns None when no user name or password is obtained.
        """
        host, selector = splithost(url)
        # Non-zero when the URL already carried credentials; it is
        # passed on as the clear_cache flag of get_user_passwd().
        at = host.find('@') + 1
        host = host[at:]
        user, passwd = self.get_user_passwd(host, realm, at)
        if not (user or passwd):
            return None
        credentials = "%s:%s@%s" % (quote(user, safe=''),
                                    quote(passwd, safe=''), host)
        newurl = 'http://' + credentials + selector
        if data is None:
            return self.open(newurl)
        return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Re-issue an https request with credentials embedded in the URL.

        Returns None when no user name or password is obtained.
        """
        host, selector = splithost(url)
        # Non-zero when the URL already carried credentials; it is
        # passed on as the clear_cache flag of get_user_passwd().
        at = host.find('@') + 1
        host = host[at:]
        user, passwd = self.get_user_passwd(host, realm, at)
        if not (user or passwd):
            return None
        credentials = "%s:%s@%s" % (quote(user, safe=''),
                                    quote(passwd, safe=''), host)
        newurl = 'https://' + credentials + selector
        if data is None:
            return self.open(newurl)
        return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, password) for *realm* at *host*.

        A cached pair is returned unless *clear_cache* is true, in which
        case the cached entry is discarded and the user is prompted
        again.  A newly obtained pair is cached when either part is
        non-empty.
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if not clear_cache:
                return self.auth_cache[key]
            del self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd:
            self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                                     (user, realm, host))
        except KeyboardInterrupt:
            print()
            return None, None
        return user, passwd
2105
2106
2107# Utility functions
2108
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup is performed on the first call only; the result is kept
    in the module-level _localhost variable.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2116
_thishost = None
def thishost():
    """Return the IP addresses of the current host as a tuple.

    The lookup is performed on the first call only; the result is kept
    in the module-level _thishost variable.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    addresses = socket.gethostbyname_ex(socket.gethostname())[2]
    _thishost = tuple(addresses)
    return _thishost
2124
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported on the first call only; its all_errors tuple is
    kept in the module-level _ftperrors variable.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2133
_noheaders = None
def noheaders():
    """Return an empty email Message object.

    The same object is returned on every call; it is created on first
    use and kept in the module-level _noheaders variable.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2141
2142
2143# Utility classes
2144
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections.

    Wraps one ftplib.FTP connection.  refcount counts the file objects
    handed out by retrfile() that have not been closed yet; keepalive
    tells whether the connection should stay open once that count
    drops to zero.
    """

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        self.user, self.passwd = user, passwd
        self.host, self.port = host, port
        self.dirs = dirs
        self.timeout = timeout
        self.refcount = 0
        self.keepalive = persistent
        self.init()

    def init(self):
        """(Re)connect, log in and change into the configured directories."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for component in self.dirs:
            self.ftp.cwd(component)

    def retrfile(self, file, type):
        """Start retrieving *file* (or a listing) and return (fileobj, length).

        *type* is the FTP transfer type letter; 'd'/'D' requests a
        directory listing.  If *file* cannot be retrieved because the
        server answers 550, a listing of it is attempted instead.
        Raises URLError for other permission errors.
        """
        import ftplib
        self.endtransfer()
        want_listing = type in ('d', 'D')
        cmd = 'TYPE A' if want_listing else 'TYPE ' + type
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # On any FTP error, reconnect once and retry the command.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not want_listing:
            # Try to retrieve as a file
            try:
                conn, retrlen = self.ftp.ntransfercmd('RETR ' + file)
            except ftplib.error_perm as reason:
                if str(reason)[:3] != '550':
                    raise URLError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if not file:
                cmd = 'LIST'
            else:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error', reason) from reason
                finally:
                    # Always return to the directory we started in.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        """Finish a pending transfer, ignoring FTP errors."""
        if self.busy:
            self.busy = 0
            try:
                self.ftp.voidresp()
            except ftperrors():
                pass

    def close(self):
        """Stop keeping the connection alive; close it now if unused."""
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        """Close hook for file objects returned by retrfile()."""
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        """Close the FTP connection, ignoring FTP errors."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2240
2241# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy (matched
    case-insensitively); this seems to be the standard convention.
    Variables with an empty value are ignored.  If you need a different
    way, you can pass a proxies dictionary to the [Fancy]URLopener
    constructor.

    """
    lowered = ((name.lower(), value) for name, value in os.environ.items())
    return {name[:-6]: value
            for name, value in lowered
            if value and name[-6:] == '_proxy'}
2257
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy (falling back
    to NO_PROXY), which should be a list of DNS suffixes separated by
    commas, or '*' for all hosts.  Returns 1 to bypass the proxy and 0
    otherwise.
    """
    env = os.environ
    no_proxy = env.get('no_proxy', '') or env.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # Compare both with and without the port.
    hostonly = splitport(host)[0]
    for entry in no_proxy.split(','):
        suffix = entry.strip()
        if not suffix:
            continue
        if hostonly.endswith(suffix) or host.endswith(suffix):
            return 1
    # otherwise, don't bypass
    return 0
2277
2278
Ronald Oussorene72e1612011-03-14 18:15:25 -04002279# This code tests an OSX specific data structure but is testable on all
2280# platforms
2281def _proxy_bypass_macosx_sysconf(host, proxy_settings):
2282 """
2283 Return True iff this host shouldn't be accessed using a proxy
2284
2285 This function uses the MacOSX framework SystemConfiguration
2286 to fetch the proxy information.
2287
2288 proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
2289 { 'exclude_simple': bool,
2290 'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
2291 }
2292 """
2293 import re
2294 import socket
2295 from fnmatch import fnmatch
2296
2297 hostonly, port = splitport(host)
2298
2299 def ip2num(ipAddr):
2300 parts = ipAddr.split('.')
2301 parts = list(map(int, parts))
2302 if len(parts) != 4:
2303 parts = (parts + [0, 0, 0, 0])[:4]
2304 return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
2305
2306 # Check for simple host names:
2307 if '.' not in host:
2308 if proxy_settings['exclude_simple']:
2309 return True
2310
2311 hostIP = None
2312
2313 for value in proxy_settings.get('exceptions', ()):
2314 # Items in the list are strings like these: *.local, 169.254/16
2315 if not value: continue
2316
2317 m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
2318 if m is not None:
2319 if hostIP is None:
2320 try:
2321 hostIP = socket.gethostbyname(hostonly)
2322 hostIP = ip2num(hostIP)
2323 except socket.error:
2324 continue
2325
2326 base = ip2num(m.group(1))
2327 mask = m.group(2)
2328 if mask is None:
2329 mask = 8 * (m.group(1).count('.') + 1)
2330 else:
2331 mask = int(mask[1:])
2332 mask = 32 - mask
2333
2334 if (hostIP >> mask) == (base >> mask):
2335 return True
2336
2337 elif fnmatch(host, value):
2338 return True
2339
2340 return False
2341
2342
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002343if sys.platform == 'darwin':
Ronald Oussoren84151202010-04-18 20:46:11 +00002344 from _scproxy import _get_proxy_settings, _get_proxies
2345
2346 def proxy_bypass_macosx_sysconf(host):
Ronald Oussoren84151202010-04-18 20:46:11 +00002347 proxy_settings = _get_proxy_settings()
Ronald Oussorene72e1612011-03-14 18:15:25 -04002348 return _proxy_bypass_macosx_sysconf(host, proxy_settings)
Ronald Oussoren84151202010-04-18 20:46:11 +00002349
2350 def getproxies_macosx_sysconf():
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002351 """Return a dictionary of scheme -> proxy server URL mappings.
2352
Ronald Oussoren84151202010-04-18 20:46:11 +00002353 This function uses the MacOSX framework SystemConfiguration
2354 to fetch the proxy information.
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002355 """
Ronald Oussoren84151202010-04-18 20:46:11 +00002356 return _get_proxies()
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002357
Ronald Oussoren84151202010-04-18 20:46:11 +00002358
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002359
2360 def proxy_bypass(host):
2361 if getproxies_environment():
2362 return proxy_bypass_environment(host)
2363 else:
Ronald Oussoren84151202010-04-18 20:46:11 +00002364 return proxy_bypass_macosx_sysconf(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002365
2366 def getproxies():
Ronald Oussoren84151202010-04-18 20:46:11 +00002367 return getproxies_environment() or getproxies_macosx_sysconf()
2368
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002369
2370elif os.name == 'nt':
2371 def getproxies_registry():
2372 """Return a dictionary of scheme -> proxy server URL mappings.
2373
2374 Win32 uses the registry to store proxies.
2375
2376 """
2377 proxies = {}
2378 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002379 import winreg
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002380 except ImportError:
2381 # Std module, so should be around - but you never know!
2382 return proxies
2383 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002384 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002385 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002386 proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002387 'ProxyEnable')[0]
2388 if proxyEnable:
2389 # Returned as Unicode but problems if not converted to ASCII
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002390 proxyServer = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002391 'ProxyServer')[0])
2392 if '=' in proxyServer:
2393 # Per-protocol settings
2394 for p in proxyServer.split(';'):
2395 protocol, address = p.split('=', 1)
2396 # See if address has a type:// prefix
2397 import re
2398 if not re.match('^([^/:]+)://', address):
2399 address = '%s://%s' % (protocol, address)
2400 proxies[protocol] = address
2401 else:
2402 # Use one setting for all protocols
2403 if proxyServer[:5] == 'http:':
2404 proxies['http'] = proxyServer
2405 else:
2406 proxies['http'] = 'http://%s' % proxyServer
Senthil Kumaran04f31b82010-07-14 20:10:52 +00002407 proxies['https'] = 'https://%s' % proxyServer
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002408 proxies['ftp'] = 'ftp://%s' % proxyServer
2409 internetSettings.Close()
2410 except (WindowsError, ValueError, TypeError):
2411 # Either registry key not found etc, or the value in an
2412 # unexpected format.
2413 # proxies already set up to be empty so nothing to do
2414 pass
2415 return proxies
2416
2417 def getproxies():
2418 """Return a dictionary of scheme -> proxy server URL mappings.
2419
2420 Returns settings gathered from the environment, if specified,
2421 or the registry.
2422
2423 """
2424 return getproxies_environment() or getproxies_registry()
2425
2426 def proxy_bypass_registry(host):
2427 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002428 import winreg
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002429 import re
2430 except ImportError:
2431 # Std modules, so should be around - but you never know!
2432 return 0
2433 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002434 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002435 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002436 proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002437 'ProxyEnable')[0]
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002438 proxyOverride = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002439 'ProxyOverride')[0])
2440 # ^^^^ Returned as Unicode but problems if not converted to ASCII
2441 except WindowsError:
2442 return 0
2443 if not proxyEnable or not proxyOverride:
2444 return 0
2445 # try to make a host list from name and IP address.
Georg Brandl13e89462008-07-01 19:56:00 +00002446 rawHost, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002447 host = [rawHost]
2448 try:
2449 addr = socket.gethostbyname(rawHost)
2450 if addr != rawHost:
2451 host.append(addr)
2452 except socket.error:
2453 pass
2454 try:
2455 fqdn = socket.getfqdn(rawHost)
2456 if fqdn != rawHost:
2457 host.append(fqdn)
2458 except socket.error:
2459 pass
2460 # make a check value list from the registry entry: replace the
2461 # '<local>' string by the localhost entry and the corresponding
2462 # canonical entry.
2463 proxyOverride = proxyOverride.split(';')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002464 # now check if we match one of the registry values.
2465 for test in proxyOverride:
Senthil Kumaran49476062009-05-01 06:00:23 +00002466 if test == '<local>':
2467 if '.' not in rawHost:
2468 return 1
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002469 test = test.replace(".", r"\.") # mask dots
2470 test = test.replace("*", r".*") # change glob sequence
2471 test = test.replace("?", r".") # change glob char
2472 for val in host:
2473 # print "%s <--> %s" %( test, val )
2474 if re.match(test, val, re.I):
2475 return 1
2476 return 0
2477
2478 def proxy_bypass(host):
2479 """Return a dictionary of scheme -> proxy server URL mappings.
2480
2481 Returns settings gathered from the environment, if specified,
2482 or the registry.
2483
2484 """
2485 if getproxies_environment():
2486 return proxy_bypass_environment(host)
2487 else:
2488 return proxy_bypass_registry(host)
2489
2490else:
2491 # By default use environment variables
2492 getproxies = getproxies_environment
2493 proxy_bypass = proxy_bypass_environment