blob: eb45c7eac6ee5b96fed85089aa3963a9ce3e3686 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
5below). It opens the URL and returns the results as file-like
6object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
urlopen(url, data=None) -- Basic usage is the same as original
urllib.  Pass the URL and optionally data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of a URL.  Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.
23
build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of a default
handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
Senthil Kumaran1107c5d2009-11-15 06:20:55 +000033
Senthil Kumaran47fff872009-12-20 07:10:31 +000034OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000036
37Request -- An object that encapsulates the state of a request. The
38state can be as simple as the URL. It can also include extra HTTP
39headers, e.g. a User-Agent.
40
41BaseHandler --
42
43internals:
44BaseHandler and parent
45_call_chain conventions
46
47Example usage:
48
Georg Brandl029986a2008-06-23 11:44:14 +000049import urllib.request
Jeremy Hylton1afc1692008-06-18 20:49:58 +000050
51# set up authentication info
Georg Brandl029986a2008-06-23 11:44:14 +000052authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000053authinfo.add_password(realm='PDQ Application',
54 uri='https://mahler:8092/site-updates.py',
55 user='klem',
56 passwd='geheim$parole')
57
Georg Brandl029986a2008-06-23 11:44:14 +000058proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton1afc1692008-06-18 20:49:58 +000059
60# build a new opener that adds authentication and caching FTP handlers
Georg Brandl029986a2008-06-23 11:44:14 +000061opener = urllib.request.build_opener(proxy_support, authinfo,
62 urllib.request.CacheFTPHandler)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000063
64# install it
Georg Brandl029986a2008-06-23 11:44:14 +000065urllib.request.install_opener(opener)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000066
Georg Brandl029986a2008-06-23 11:44:14 +000067f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton1afc1692008-06-18 20:49:58 +000068"""
69
70# XXX issues:
# If an authentication error handler tries to perform
# authentication for some reason but fails, how should the error be
# signalled?  The client needs to know the HTTP error code.  But if
# the handler knows what the problem was, e.g., that it didn't know
# the hash algo that was requested in the challenge, it would be good to
# pass that information along to the client, too.
77# ftp errors aren't handled cleanly
78# check digest against correct (i.e. non-apache) implementation
79
80# Possible extensions:
81# complex proxies XXX not sure what exactly was meant by this
82# abstract factory for opener
83
84import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000085import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000086import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import random
93import re
94import socket
95import sys
96import time
Senthil Kumaran7bc0d872010-12-19 10:49:52 +000097import collections
Senthil Kumaran0ea91cb2012-05-15 23:59:42 +080098import warnings
Jeremy Hylton1afc1692008-06-18 20:49:58 +000099
Georg Brandl13e89462008-07-01 19:56:00 +0000100from urllib.error import URLError, HTTPError, ContentTooShortError
101from urllib.parse import (
102 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
103 splittype, splithost, splitport, splituser, splitpasswd,
Senthil Kumarand95cc752010-08-08 11:27:53 +0000104 splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000105from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000106
# check for SSL
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True

# Used in the User-Agent header sent.  The "major.minor" string is built
# from sys.version_info because slicing sys.version[:3] truncates two-digit
# minor versions (e.g. "3.10.4" would become "3.1").
__version__ = '%d.%d' % sys.version_info[:2]
117
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None):
    """Open *url* and return whatever the selected opener's open() returns.

    url     -- a URL string or a Request object (passed through to open()).
    data    -- optional request data, passed through to open().
    timeout -- timeout value, passed through to open().
    cafile, capath -- keyword-only; when either is given, a one-off opener
        is built around an HTTPSHandler whose SSL context requires a
        certificate, loads the given verify locations and checks the host
        name.  That opener is not stored as the module default.

    Without cafile/capath the module-level default opener is used; it is
    created with build_opener() on first use.

    Raises ValueError if cafile/capath is given but the ssl module could
    not be imported.
    """
    global _opener
    if cafile or capath:
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.options |= ssl.OP_NO_SSLv2
        # This branch is only reached when a CA location was supplied, so
        # verification is always required here (the former inner
        # "no CA location" branch could never execute).
        context.verify_mode = ssl.CERT_REQUIRED
        context.load_verify_locations(cafile, capath)
        https_handler = HTTPSHandler(context=context, check_hostname=True)
        opener = build_opener(https_handler)
    elif _opener is None:
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000140
def install_opener(opener):
    """Store *opener* as the module-level default opener."""
    global _opener
    _opener = opener
144
# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Delegate to retrieve() on the shared legacy opener.

    The shared FancyURLopener is created lazily on the first call and then
    reused; all four arguments are passed through unchanged and the result
    of its retrieve() call is returned.
    """
    global _urlopener
    legacy = _urlopener
    if not legacy:
        legacy = _urlopener = FancyURLopener()
    return legacy.retrieve(url, filename, reporthook, data)
152
def urlcleanup():
    """Run cleanup() on the legacy opener and forget the default opener."""
    global _opener
    # The legacy opener is kept; only its cleanup() hook is invoked.
    if _urlopener:
        _urlopener.cleanup()
    # Dropping the reference makes the next urlopen() build a fresh opener.
    if _opener:
        _opener = None
159
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the network-location part of request.full_url;
    when that part is empty, the request's "Host" header is used instead
    (defaulting to the empty string).  A trailing ":<digits>" port is
    stripped.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urlparse(request.full_url)[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present
    without_port = _cut_port_re.sub("", netloc, 1)
    return without_port.lower()
177
class Request:
    """State of a single URL request: URL parts, data and headers."""

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        """Parse *url* and record the request's data and headers.

        Raises ValueError (from _parse) when the URL has no scheme.
        """
        # NOTE: the shared default ``headers`` dict is only iterated here,
        # never mutated.
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url, self.fragment = splittag(unwrap(url))
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        for name, value in headers.items():
            self.add_header(name, value)
        self.unredirected_hdrs = {}
        # request_host() reads full_url and the header tables set above.
        self.origin_req_host = (request_host(self)
                                if origin_req_host is None
                                else origin_req_host)
        self.unverifiable = unverifiable
        self._parse()

    def _parse(self):
        """Split full_url into the type, host and selector attributes."""
        scheme, remainder = splittype(self.full_url)
        self.type = scheme
        if scheme is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(remainder)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return "POST" when data is set, "GET" otherwise."""
        return "GET" if self.data is None else "POST"

    # Begin deprecated methods

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        """Return full_url with "#fragment" re-attached when one exists."""
        if not self.fragment:
            return self.full_url
        return '%s#%s' % (self.full_url, self.fragment)

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Point the request at proxy *host*.

        An https request without a recorded tunnel host remembers its
        current host as the tunnel host; any other request takes the
        proxy's type and uses the full URL as its selector.
        """
        needs_tunnel = self.type == 'https' and not self._tunnel_host
        if needs_tunnel:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        """Return True when the selector is the full URL."""
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        name = key.capitalize()
        self.headers[name] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        name = key.capitalize()
        self.unredirected_hdrs[name] = val

    def has_header(self, header_name):
        """Return True if *header_name* is a key of either header table."""
        tables = (self.headers, self.unredirected_hdrs)
        return any(header_name in table for table in tables)

    def get_header(self, header_name, default=None):
        """Look up *header_name*; regular headers take precedence."""
        fallback = self.unredirected_hdrs.get(header_name, default)
        return self.headers.get(header_name, fallback)

    def header_items(self):
        """Return a list of (name, value) pairs from both header tables.

        On a name clash the regular header value wins.
        """
        merged = dict(self.unredirected_hdrs)
        merged.update(self.headers)
        return list(merged.items())
277
class OpenerDirector:
    """Dispatch requests to registered handler objects.

    Handlers are indexed by the names of their methods:
    ``<protocol>_open``, ``<protocol>_request``, ``<protocol>_response``
    and ``<protocol>_error...``.
    """

    def __init__(self):
        user_agent = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', user_agent)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # registries mapping a protocol (or error kind) to its handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler* under every method name that matches.

        Raises TypeError if *handler* has no ``add_parent`` attribute.
        The handler's add_parent(self) is called only when at least one of
        its attribute names matched a known pattern.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        # Conditions whose registry is keyed directly by the protocol name.
        by_protocol = {
            "open": self.handle_open,
            "response": self.process_response,
            "request": self.process_request,
        }
        registered = False
        for name in dir(handler):
            if name in ("redirect_request", "do_open", "proxy_open"):
                # oops, coincidental match
                continue

            cut = name.find("_")
            protocol, condition = name[:cut], name[cut + 1:]

            if condition.startswith("error"):
                # The kind is the text after the first underscore inside
                # the condition; it is stored as an int when it parses.
                kind = name[cut + condition.find("_") + 2:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                registry = self.handle_error.setdefault(protocol, {})
            elif condition in by_protocol:
                kind = protocol
                registry = by_protocol[condition]
            else:
                continue

            chain = registry.setdefault(kind, [])
            if chain:
                # keep the chain sorted by the handlers' own ordering
                bisect.insort(chain, handler)
            else:
                chain.append(handler)
            registered = True

        if registered:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        for handler in chain.get(kind, ()):
            outcome = getattr(handler, meth_name)(*args)
            if outcome is not None:
                return outcome

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a Request) and return the response.

        The request goes through the ``<protocol>_request`` processors,
        then _open(), then the ``<protocol>_response`` processors.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        hook = protocol + "_request"
        for processor in self.process_request.get(protocol, []):
            req = getattr(processor, hook)(req)

        response = self._open(req, data)

        # post-process response
        hook = protocol + "_response"
        for processor in self.process_response.get(protocol, []):
            response = getattr(processor, hook)(req, response)

        return response

    def _open(self, req, data=None):
        """Try the 'default', protocol-specific and 'unknown' open chains."""
        protocol = req.type
        for kind, meth_name in (('default', 'default_open'),
                                (protocol, protocol + '_open')):
            result = self._call_chain(self.handle_open, kind, meth_name, req)
            if result:
                return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the matching error-handler chain.

        For 'http'/'https' the chain is selected by args[2] and, if that
        yields nothing truthy, the 'default' chain is tried.
        """
        is_http = proto in ('http', 'https')
        if is_http:
            # XXX http[s] protocols are special-cased
            table = self.handle_error['http']  # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
        else:
            table = self.handle_error
            meth_name = proto + '_error'

        result = self._call_chain(table, proto, meth_name, *args)
        if result:
            return result

        if is_http:
            return self._call_chain(table, 'default',
                                    'http_error_default', *args)
415
416# XXX probably also want an abstract factory that knows when it makes
417# sense to skip a superclass in favor of a subclass and when it might
418# make sense to include both
419
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    Each argument may be a handler instance or a handler class (classes
    are instantiated without arguments).  A default handler class is left
    out when any argument is an instance or a subclass of it.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    def replaces(candidate, default):
        # A class replaces a default it derives from; an instance replaces
        # a default it is an instance of.
        if isclass(candidate):
            return issubclass(candidate, default)
        return isinstance(candidate, default)

    opener = OpenerDirector()
    defaults = [ProxyHandler, UnknownHandler, HTTPHandler,
                HTTPDefaultErrorHandler, HTTPRedirectHandler,
                FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        defaults.append(HTTPSHandler)

    overridden = {klass
                  for klass in defaults
                  for candidate in handlers
                  if replaces(candidate, klass)}

    # Defaults first (in their listed order), then the caller's handlers.
    for klass in defaults:
        if klass not in overridden:
            opener.add_handler(klass())

    for candidate in handlers:
        instance = candidate() if isclass(candidate) else candidate
        opener.add_handler(instance)
    return opener
457
class BaseHandler:
    """Base class for handlers; provides ordering and parent bookkeeping."""

    # Position used by __lt__ when handlers are kept in sorted lists.
    handler_order = 500

    def add_parent(self, parent):
        """Remember *parent* as ``self.parent``."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        """Order handlers by ``handler_order``.

        Objects without a ``handler_order`` attribute always compare as
        greater: this tries to preserve the old behavior of having custom
        classes inserted after default ones (works only for custom user
        classes which are not aware of handler_order).
        """
        return (not hasattr(other, "handler_order")
                or self.handler_order < other.handler_order)
475
476
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return 2xx responses unchanged; route the rest to parent.error()."""
        status = response.code
        reason = response.msg
        headers = response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= status < 300:
            return response
        return self.parent.error(
            'http', request, response, status, reason, headers)

    https_response = http_response
493
class HTTPDefaultErrorHandler(BaseHandler):
    """Handler whose default error hook always raises HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        """Raise HTTPError built from the request URL and the error details."""
        failure = HTTPError(req.full_url, code, msg, hdrs, fp)
        raise failure
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000497
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirects, with loop protection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        method = req.get_method()
        idempotent_ok = code in (301, 302, 303, 307) and method in ("GET", "HEAD")
        post_ok = code in (301, 302, 303) and method == "POST"
        if not (idempotent_ok or post_ok):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        # The new request carries no data, so drop the headers describing it.
        CONTENT_HEADERS = ("content-length", "content-type")
        kept = {name: value
                for name, value in req.headers.items()
                if name.lower() not in CONTENT_HEADERS}
        return Request(newurl,
                       headers=kept,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow the redirect named by the Location (or URI) header.

        Returns None when neither header is present or when
        redirect_request() declines.  Raises HTTPError for a disallowed
        target scheme or when the loop limits are exceeded.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        for header_name in ("location", "uri"):
            if header_name in headers:
                newurl = headers[header_name]
                break
        else:
            return

        # fix a possible malformed URL
        parts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.
        if parts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        if not parts.path:
            parts = list(parts)
            parts[2] = "/"
        newurl = urlunparse(parts)

        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if not hasattr(req, 'redirect_dict'):
            seen = new.redirect_dict = req.redirect_dict = {}
        else:
            seen = new.redirect_dict = req.redirect_dict
            too_many_repeats = seen.get(newurl, 0) >= self.max_repeats
            if too_many_repeats or len(seen) >= self.max_redirections:
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        seen[newurl] = seen.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302
600
601
602def _parse_proxy(proxy):
603 """Return (scheme, user, password, host/port) given a URL or an authority.
604
605 If a URL is supplied, it must have an authority (host:port) component.
606 According to RFC 3986, having an authority component means the URL must
607 have two slashes after the scheme:
608
609 >>> _parse_proxy('file:/ftp.example.com/')
610 Traceback (most recent call last):
611 ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
612
613 The first three items of the returned tuple may be None.
614
615 Examples of authority parsing:
616
617 >>> _parse_proxy('proxy.example.com')
618 (None, None, None, 'proxy.example.com')
619 >>> _parse_proxy('proxy.example.com:3128')
620 (None, None, None, 'proxy.example.com:3128')
621
622 The authority component may optionally include userinfo (assumed to be
623 username:password):
624
625 >>> _parse_proxy('joe:password@proxy.example.com')
626 (None, 'joe', 'password', 'proxy.example.com')
627 >>> _parse_proxy('joe:password@proxy.example.com:3128')
628 (None, 'joe', 'password', 'proxy.example.com:3128')
629
630 Same examples, but with URLs instead:
631
632 >>> _parse_proxy('http://proxy.example.com/')
633 ('http', None, None, 'proxy.example.com')
634 >>> _parse_proxy('http://proxy.example.com:3128/')
635 ('http', None, None, 'proxy.example.com:3128')
636 >>> _parse_proxy('http://joe:password@proxy.example.com/')
637 ('http', 'joe', 'password', 'proxy.example.com')
638 >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
639 ('http', 'joe', 'password', 'proxy.example.com:3128')
640
641 Everything after the authority is ignored:
642
643 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
644 ('ftp', 'joe', 'password', 'proxy.example.com')
645
646 Test for no trailing '/' case:
647
648 >>> _parse_proxy('http://joe:password@proxy.example.com')
649 ('http', 'joe', 'password', 'proxy.example.com')
650
651 """
Georg Brandl13e89462008-07-01 19:56:00 +0000652 scheme, r_scheme = splittype(proxy)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000653 if not r_scheme.startswith("/"):
654 # authority
655 scheme = None
656 authority = proxy
657 else:
658 # URL
659 if not r_scheme.startswith("//"):
660 raise ValueError("proxy URL with no authority: %r" % proxy)
661 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
662 # and 3.3.), path is empty or starts with '/'
663 end = r_scheme.find("/", 2)
664 if end == -1:
665 end = None
666 authority = r_scheme[2:end]
Georg Brandl13e89462008-07-01 19:56:00 +0000667 userinfo, hostport = splituser(authority)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000668 if userinfo is not None:
Georg Brandl13e89462008-07-01 19:56:00 +0000669 user, password = splitpasswd(userinfo)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000670 else:
671 user = password = None
672 return scheme, user, password, hostport
673
class ProxyHandler(BaseHandler):
    """Route requests through the proxies given as a scheme -> URL mapping."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        """Install one ``<scheme>_open`` method per configured proxy.

        When *proxies* is None the mapping comes from getproxies().
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, url in proxies.items():
            # Default arguments freeze this iteration's values in the lambda.
            opener = lambda r, proxy=url, type=scheme, meth=self.proxy_open: \
                meth(r, proxy, type)
            setattr(self, '%s_open' % scheme, opener)

    def proxy_open(self, req, proxy, type):
        """Redirect *req* to *proxy*, unless its host bypasses proxies.

        Returns None to let other handlers carry on, or the result of
        re-opening the request when the proxy uses a different URL type.
        """
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            credentials = '%s:%s' % (unquote(user), unquote(password))
            encoded = base64.b64encode(credentials.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + encoded)
        req.set_proxy(unquote(hostport), proxy_type)

        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None

        # need to start over, because the other handlers don't
        # grok the proxy's URL type
        # e.g. if we have a constructor arg proxies like so:
        # {'http': 'ftp://proxy.example.com'}, we may end up turning
        # a request for http://acme.example.com/a into one for
        # ftp://proxy.example.com/a
        return self.parent.open(req, timeout=req.timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000715
class HTTPPasswordMgr:
    """Map (realm, URI) pairs to (user, password) credentials."""

    def __init__(self):
        # realm -> {tuple of reduced URIs -> (user, passwd)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* and one or more URIs.

        *uri* may be a single string or a sequence of strings.  Each entry
        is stored twice: once with the scheme's default port made explicit
        and once as given.
        """
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        by_uri = self.passwd.setdefault(realm, {})
        for default_port in True, False:
            reduced_uri = tuple(
                self.reduce_uri(u, default_port) for u in uri)
            by_uri[reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) for *authuri*, or (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form, i.e. (authority, path)
        pairs.  The path comparison is done on whole path segments, so
        "/foo" covers "/foo" and "/foo/bar" but not "/foobar"; a plain
        character-wise common prefix would wrongly accept the latter.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
778
779
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the realm ``None``."""

    def find_user_password(self, realm, authuri):
        """Look up *realm* first; if no user is found, retry with realm None."""
        lookup = HTTPPasswordMgr.find_user_password
        user, password = lookup(self, realm, authuri)
        if user is None:
            return lookup(self, None, authuri)
        return user, password
788
789
class AbstractBasicAuthHandler:
    """Shared logic for answering Basic authentication challenges.

    Concrete subclasses are expected to provide ``auth_header`` (the name
    of the request header carrying the credentials) and a ``parent``
    opener; both are read by retry_http_basic_auth().
    """

    # Matches one "<scheme> realm=<value>" challenge that starts at the
    # beginning of the header or right after a comma.
    # Allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild).
    # The pattern is deliberately anchored instead of using a leading
    # '(?:.*,)*' prefix, which backtracks catastrophically on headers that
    # contain many commas.
    rx = re.compile('(?:^|,)'      # start of the string or ','
                    '[ \t]*'       # optional whitespace
                    '([^ \t,]+)'   # scheme, e.g. "Basic"
                    '[ \t]+'       # mandatory whitespace
                    'realm=(["\']?)([^"\']*)\\2',
                    re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        """Use *password_mgr*, or a fresh HTTPPasswordMgr when None."""
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password
        self.retried = 0

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Answer the first Basic challenge found in header *authreq*.

        host    -- an authority (without userinfo) or a URL with an
                   authority; used to look up the credentials.
        req     -- the request to retry.
        headers -- mapping holding the challenge header.

        Returns the retried response, or None when the header is missing,
        holds no Basic challenge, or no new credentials are available.
        Raises HTTPError once the retry counter exceeds 5.
        """
        # XXX could be multiple headers
        challenge = headers.get(authreq, None)

        if self.retried > 5:
            # retry sending the username:password 5 times before failing.
            raise HTTPError(req.get_full_url(), 401, "basic auth failed",
                            headers, None)
        else:
            self.retried += 1

        if not challenge:
            return None

        # A header may carry several challenges; use the first Basic one
        # rather than whichever challenge happens to come last.
        for mo in AbstractBasicAuthHandler.rx.finditer(challenge):
            scheme, quote, realm = mo.groups()
            if quote not in ["'", '"']:
                warnings.warn("Basic Auth Realm was unquoted",
                              UserWarning, 2)
            if scheme.lower() != 'basic':
                continue
            response = self.retry_http_basic_auth(host, req, realm)
            if response and response.code != 401:
                self.retried = 0
            return response
        return None

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send *req* with Basic credentials for (*realm*, *host*).

        Returns None when no password is known or when the request's
        regular headers already carry exactly these credentials.
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is None:
            return None
        raw = "%s:%s" % (user, pw)
        auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
        if req.headers.get(self.auth_header, None) == auth:
            # same credentials as before: retrying would not help
            return None
        req.add_unredirected_header(self.auth_header, auth)
        return self.parent.open(req, timeout=req.timeout)
851
852
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses using the 'Authorization' request header."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry the request with credentials looked up for its full URL."""
        outcome = self.http_error_auth_reqed(
            'www-authenticate', req.full_url, req, headers)
        # Only reached when the retry logic did not raise.
        self.reset_retry_count()
        return outcome
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000863
864
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses using the 'Proxy-authorization' request header."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Retry the request with credentials looked up for ``req.host``."""
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        outcome = self.http_error_auth_reqed(
            'proxy-authenticate', req.host, req, headers)
        # Only reached when the retry logic did not raise.
        self.reset_retry_count()
        return outcome
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000879
880
def randombytes(n):
    """Return n random bytes obtained from os.urandom."""
    entropy = os.urandom(n)
    return entropy
884
class AbstractDigestAuthHandler:
    """Shared implementation of HTTP Digest authentication retries.

    Concrete subclasses supply ``auth_header`` and are expected to be
    installed in an opener so that ``self.parent`` is available.
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        """Store the password manager (a fresh HTTPPasswordMgr by default)."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        # nonce_count/last_nonce track how often the current server nonce
        # has been used, for the "nc" field of the Authorization header.
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        """Reset the counter of authentication attempts."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry ``req`` with Digest credentials if the challenge asks for it.

        Returns the retried response, or None when the challenge header is
        missing or names another scheme.  Raises HTTPError once more than
        five attempts have been counted.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Parse the challenge in ``auth`` and re-send ``req`` with a response.

        Returns None when no authorization could be computed or when the
        identical header value is already on the request.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        """Return a 16-hex-digit client nonce."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the value of the Digest authorization header.

        ``chal`` is the parsed challenge (a dict).  Returns None when the
        challenge lacks realm/nonce, names an unsupported algorithm, or no
        user is known for the realm.  Raises URLError when the challenge
        offers a qop list that does not include "auth".
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in [option.strip() for option in qop.split(',')]:
            # The challenge's qop is a comma-separated list of options
            # (e.g. "auth,auth-int"); "auth" is the one implemented here.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth',
                                           H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the ``(H, KD)`` hash helpers for ``algorithm``.

        ``H`` hashes an ASCII string to a hex digest and ``KD(s, d)`` is
        ``H("s:d")``.  For an algorithm other than 'MD5' or 'SHA' the
        result is ``(None, None)``, which get_authorization treats as
        "cannot authenticate".
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # XXX MD5-sess
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        """Placeholder for the entity digest; always returns None."""
        # XXX not implemented yet
        return None
1021
1022
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        """Answer a 401 by retrying with a Digest response."""
        # Element 1 of the parsed URL is handed on as the host argument.
        netloc = urlparse(req.full_url)[1]
        outcome = self.http_error_auth_reqed(
            'www-authenticate', netloc, req, headers)
        # Only reached when the retry logic did not raise.
        self.reset_retry_count()
        return outcome
1039
1040
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 responses using the 'Proxy-Authorization' header."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        """Answer a 407 by retrying with a Digest response."""
        outcome = self.http_error_auth_reqed(
            'proxy-authenticate', req.host, req, headers)
        # Only reached when the retry logic did not raise.
        self.reset_retry_count()
        return outcome
1052
class AbstractHTTPHandler(BaseHandler):
    """Common request preparation and connection handling for HTTP(S)."""

    def __init__(self, debuglevel=0):
        """Remember the debug level to apply to each connection."""
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Change the debug level used for subsequently opened connections."""
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in the headers a request needs before it is sent.

        Adds Content-type/Content-length for requests carrying data, a
        Host header, and the opener's default headers, each only when the
        request does not already have that header.  Returns the request.

        Raises URLError when the request has no host, TypeError when the
        data is a str, and ValueError when the data is an iterable without
        a buffer interface and no Content-length was supplied.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                msg = "POST data should be bytes or an iterable of bytes. "\
                      "It cannot be of type str."
                raise TypeError(msg)
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                try:
                    # Anything exposing the buffer interface has a size
                    # that can be computed up front.
                    mv = memoryview(data)
                except TypeError:
                    if isinstance(data, collections.Iterable):
                        raise ValueError("Content-Length should be specified "
                                "for iterable data of type %r %r" % (type(data),
                                data))
                else:
                    request.add_unredirected_header(
                            'Content-length', '%d' % (len(mv) * mv.itemsize))

        sel_host = host
        if request.has_proxy():
            # The selector is parsed to obtain the host for the Host header.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        Extra keyword arguments are passed on to the http_class constructor.
        Raises URLError when the request has no host or when sending the
        request fails with a socket error.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        # Apply the level given to __init__/set_http_debuglevel; without
        # this call the stored value never reaches the connection.
        h.set_debuglevel(self._debuglevel)

        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers)
            except socket.error as err:  # timeout error
                raise URLError(err)
            r = h.getresponse()
        except BaseException:
            # Whatever went wrong (sending or reading the response), do
            # not leave the connection open behind the propagating error.
            h.close()
            raise

        r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with the reason phrase, because urllib clients expect the
        # response to have the reason in .msg.  It would be good to mark
        # this attribute as deprecated and get them to use info() or
        # .headers.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001155
1156
class HTTPHandler(AbstractHTTPHandler):
    """Handler for plain http: URLs."""

    # Request pre-processing is the generic implementation.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Open ``req`` over an http.client.HTTPConnection."""
        connection_class = http.client.HTTPConnection
        return self.do_open(connection_class, req)
1163
if hasattr(http.client, 'HTTPSConnection'):
    import ssl

    class HTTPSHandler(AbstractHTTPHandler):
        """Handler for https: URLs; defined only when HTTPSConnection exists."""

        # Request pre-processing is the generic implementation.
        https_request = AbstractHTTPHandler.do_request_

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            """Remember the context/check_hostname to pass to each connection."""
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            """Open ``req`` over an http.client.HTTPSConnection."""
            connection_options = {
                'context': self._context,
                'check_hostname': self._check_hostname,
            }
            return self.do_open(http.client.HTTPSConnection, req,
                                **connection_options)
1179
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to requests and collect them from responses."""

    def __init__(self, cookiejar=None):
        """Use ``cookiejar``, or a new http.cookiejar.CookieJar when None."""
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        """Let the jar add its cookie header, then pass the request on."""
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Let the jar extract cookies, then pass the response on."""
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    # The https hooks are the same functions.
    https_request = http_request
    https_response = http_response
1197
class UnknownHandler(BaseHandler):
    """Handler whose unknown_open hook always fails with URLError."""

    def unknown_open(self, req):
        """Raise URLError naming the request's URL type."""
        raise URLError('unknown url type: %s' % req.type)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001202
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dict mapping each key to its value, with one pair of
    surrounding double quotes removed from the value.  An empty value
    (``"key="``) is kept as the empty string.  An element without ``=``
    raises ValueError.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices instead of indexing so that an empty value does not
        # raise IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1212
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    Splits ``s`` on commas that are outside double-quoted strings and
    returns the stripped elements.  Inside quotes a backslash escapes the
    next character (the backslash itself is dropped).  Quotes are kept in
    the elements; single quotes have no special meaning.  A trailing empty
    element is not reported.
    """
    elements = []
    current = []          # characters of the element being collected
    escaped = False       # previous character was a backslash inside quotes
    in_quotes = False

    for ch in s:
        if escaped:
            current.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                current.append(ch)
        elif ch == ',':
            elements.append(''.join(current))
            current = []
        else:
            if ch == '"':
                in_quotes = True
            current.append(ch)

    # append last part
    if current:
        elements.append(''.join(current))

    return [element.strip() for element in elements]
1255
class FileHandler(BaseHandler):
    """Open file: URLs that refer to the local machine."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open the local file named by ``req``.

        When the selector starts with exactly two slashes and the request
        names a host other than 'localhost', the host must be one of this
        machine's addresses (see get_names), otherwise URLError is raised.
        NOTE: when that check passes nothing is opened here and None is
        returned implicitly.
        """
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            # Membership test: comparing the host string to the tuple of
            # names by identity could never succeed.
            if req.host not in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the tuple of addresses of this host, cached on the class."""
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the local file ``req.selector`` names.

        Raises URLError when the file cannot be accessed or when the
        request's host does not resolve to this machine (or carries a
        port).
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as exp:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(exp)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001307
def _safe_gethostbyname(host):
    """Return socket.gethostbyname(host), or None on socket.gaierror."""
    try:
        address = socket.gethostbyname(host)
    except socket.gaierror:
        return None
    return address
1313
class FTPHandler(BaseHandler):
    """Open ftp: URLs through an ftpwrapper connection."""

    def ftp_open(self, req):
        """Retrieve the file or directory listing named by ``req``.

        Returns an addinfourl whose headers carry Content-type and
        Content-length when they are known.  Raises URLError when no host
        is given, when the host cannot be resolved, or when ftplib reports
        an error.
        """
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        port = ftplib.FTP_PORT if port is None else int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)

        path, attrs = splitattr(req.selector)
        segments = [unquote(segment) for segment in path.split('/')]
        dirs = segments[:-1]
        file = segments[-1]
        if dirs and not dirs[0]:
            # Drop the empty first component.
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Default transfer type: 'I' when a file name is present,
            # 'D' otherwise; a ";type=" attribute overrides it.
            transfer_type = 'I' if file else 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if (attr.lower() == 'type' and
                        value in ('a', 'A', 'i', 'I', 'd', 'D')):
                    transfer_type = value.upper()
            fp, retrlen = fw.retrfile(file, transfer_type)
            header_lines = []
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                header_lines.append("Content-type: %s\n" % mtype)
            if retrlen is not None and retrlen >= 0:
                header_lines.append("Content-length: %d\n" % retrlen)
            headers = email.message_from_string("".join(header_lines))
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as exp:
            exc = URLError('ftp error: %r' % exp)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Create a non-persistent ftpwrapper for the given endpoint."""
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001371
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps ftpwrapper connections in a cache for reuse."""
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}       # key -> ftpwrapper
        self.timeout = {}     # key -> time at which the entry expires
        self.soonest = 0      # smallest value in self.timeout (0 if empty)
        self.delay = 60       # seconds an entry stays valid after use
        self.max_conns = 16

    def setTimeout(self, t):
        """Set the number of seconds a cached connection stays valid."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which the oldest entry is dropped."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached connection for the endpoint, creating it if needed.

        Every use pushes the entry's expiry time ``self.delay`` seconds
        into the future.
        """
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Close expired connections and keep the cache below max_conns."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        # min() of an empty sequence raises ValueError, so fall back to
        # the initial value when nothing is cached any more.
        self.soonest = min(self.timeout.values()) if self.timeout else 0

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # Close the evicted connection, as is done for expired
                    # ones, instead of just dropping the reference.
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values()) if self.timeout else 0

    def clear_cache(self):
        """Close and forget every cached connection."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
1424
1425
# Code moved from the old urllib module
1427
MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Path <-> URL helpers: Windows has its own implementation in nturl2path;
# every other platform only needs percent (un)quoting.
if os.name != 'nt':
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        local_path = unquote(pathname)
        return local_path

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        url_path = quote(pathname)
        return url_path
else:
    from nturl2path import url2pathname, pathname2url
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001443
1444# This really consists of two pieces:
1445# (1) a class which handles opening of all sorts of URLs
1446# (plus assorted utilities etc.)
1447# (2) a set of functions for parsing URLs
1448# XXX Should these be separated out into different modules?
1449
1450
1451ftpcache = {}
1452class URLopener:
1453 """Class to open URLs.
1454 This is a class rather than just a subroutine because we may need
1455 more than one set of global protocol-specific options.
1456 Note -- this is a base class for those who don't want the
1457 automatic handling of errors type 302 (relocated) and 401
1458 (authorization needed)."""
1459
1460 __tempfiles = None
1461
1462 version = "Python-urllib/%s" % __version__
1463
1464 # Constructor
def __init__(self, proxies=None, **x509):
    """Set up proxies, optional key/cert files, headers and caches.

    ``proxies`` must be a mapping; when None, getproxies() supplies it.
    The keyword arguments 'key_file' and 'cert_file' are stored as given.
    """
    if proxies is None:
        proxies = getproxies()
    assert hasattr(proxies, 'keys'), "proxies must be a mapping"
    self.proxies = proxies

    key_file = x509.get('key_file')
    cert_file = x509.get('cert_file')
    self.key_file = key_file
    self.cert_file = cert_file

    self.addheaders = [('User-Agent', self.version)]

    # Temporary files created by retrieve(); removed again by cleanup().
    self.__tempfiles = []
    self.__unlink = os.unlink  # See cleanup()

    # Undocumented feature: if you assign {} to tempcache,
    # it is used to cache files retrieved with
    # self.retrieve().  This is not enabled by default
    # since it does not work for changing documents (and I
    # haven't got the logic to check expiration headers
    # yet).
    self.tempcache = None

    # Undocumented feature: you can use a different
    # ftp cache by assigning to the .ftpcache member;
    # in case you want logically independent URL openers
    # XXX This is not threadsafe.  Bah.
    self.ftpcache = ftpcache
1487
def __del__(self):
    """Release resources through close() when the opener is finalized."""
    self.close()

def close(self):
    """Release resources held by the opener; delegates to cleanup()."""
    self.cleanup()
1493
def cleanup(self):
    """Remove the recorded temporary files and empty the temp cache.

    Failures to remove a file (OSError) are ignored.
    """
    # This code sometimes runs when the rest of this module
    # has already been deleted, so it can't use any globals
    # or import anything.
    pending = self.__tempfiles
    if pending:
        for path in pending:
            try:
                self.__unlink(path)
            except OSError:
                pass
        del pending[:]
    cache = self.tempcache
    if cache:
        cache.clear()
1507
def addheader(self, *args):
    """Add a header to be used by the HTTP interface only
    e.g. u.addheader('Accept', 'sound/basic')"""
    # The positional arguments are stored together as one tuple.
    header = args
    self.addheaders.append(header)
1512
1513 # External interface
def open(self, fullurl, data=None):
    """Use URLopener().open(file) instead of open(file, 'r')."""
    fullurl = quote(unwrap(to_bytes(fullurl)),
                    safe="%/:=&?~#+!$,;'@()*[]|")

    # Serve from the temp cache when the URL has been retrieved before.
    if self.tempcache and fullurl in self.tempcache:
        filename, headers = self.tempcache[fullurl]
        fp = open(filename, 'rb')
        return addinfourl(fp, headers, fullurl)

    urltype, url = splittype(fullurl)
    if not urltype:
        urltype = 'file'

    proxy = None
    if urltype in self.proxies:
        proxy = self.proxies[urltype]
        urltype, proxyhost = splittype(proxy)
        host, selector = splithost(proxyhost)
        url = (host, fullurl)  # Signal special case to open_*()

    # The handler is the method named after the (proxy's) URL type.
    self.type = urltype
    name = ('open_' + urltype).replace('-', '_')
    if not hasattr(self, name):
        if proxy:
            return self.open_unknown_proxy(proxy, fullurl, data)
        return self.open_unknown(fullurl, data)

    try:
        call_args = (url,) if data is None else (url, data)
        return getattr(self, name)(*call_args)
    except socket.error as msg:
        raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1547
def open_unknown(self, fullurl, data=None):
    """Overridable interface to open unknown URL type."""
    scheme = splittype(fullurl)[0]
    raise IOError('url error', 'unknown url type', scheme)
1552
def open_unknown_proxy(self, proxy, fullurl, data=None):
    """Overridable interface to open unknown URL type."""
    scheme = splittype(fullurl)[0]
    reason = 'invalid proxy for %s' % scheme
    raise IOError('url error', reason, proxy)
1557
1558 # External interface
def retrieve(self, url, filename=None, reporthook=None, data=None):
    """retrieve(url) returns (filename, headers) for a local object
    or (tempfilename, headers) for a remote object."""
    url = unwrap(to_bytes(url))
    if self.tempcache and url in self.tempcache:
        return self.tempcache[url]

    scheme, remainder = splittype(url)
    if filename is None and (not scheme or scheme == 'file'):
        # A local file needs no copy: report its own path.
        try:
            local_fp = self.open_local_file(remainder)
            local_headers = local_fp.info()
            local_fp.close()
            return url2pathname(splithost(remainder)[1]), local_headers
        except IOError:
            pass

    fp = self.open(url, data)
    try:
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            # No target given: create a temporary file whose suffix is
            # taken from the path part of the URL.
            import tempfile
            path = splittype(url)[1]
            path = splithost(path or "")[1]
            path = splitquery(path or "")[0]
            path = splitattr(path or "")[0]
            suffix = os.path.splitext(path)[1]
            fd, filename = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        try:
            result = filename, headers
            if self.tempcache is not None:
                self.tempcache[url] = result
            bs = 1024 * 8
            size = -1
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
        finally:
            tfp.close()
    finally:
        fp.close()

    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result
1622
1623 # Each method named open_<type> knows how to open that type of URL
1624
1625 def _open_generic_http(self, connection_factory, url, data):
1626 """Make an HTTP connection using connection_class.
1627
1628 This is an internal method that should be called from
1629 open_http() or open_https().
1630
1631 Arguments:
1632 - connection_factory should take a host name and return an
1633 HTTPConnection instance.
1634 - url is the url to retrieval or a host, relative-path pair.
1635 - data is payload for a POST request or None.
1636 """
1637
1638 user_passwd = None
1639 proxy_passwd= None
1640 if isinstance(url, str):
Georg Brandl13e89462008-07-01 19:56:00 +00001641 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001642 if host:
Georg Brandl13e89462008-07-01 19:56:00 +00001643 user_passwd, host = splituser(host)
1644 host = unquote(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001645 realhost = host
1646 else:
1647 host, selector = url
1648 # check whether the proxy contains authorization information
Georg Brandl13e89462008-07-01 19:56:00 +00001649 proxy_passwd, host = splituser(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001650 # now we proceed with the url we want to obtain
Georg Brandl13e89462008-07-01 19:56:00 +00001651 urltype, rest = splittype(selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001652 url = rest
1653 user_passwd = None
1654 if urltype.lower() != 'http':
1655 realhost = None
1656 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001657 realhost, rest = splithost(rest)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001658 if realhost:
Georg Brandl13e89462008-07-01 19:56:00 +00001659 user_passwd, realhost = splituser(realhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001660 if user_passwd:
1661 selector = "%s://%s%s" % (urltype, realhost, rest)
1662 if proxy_bypass(realhost):
1663 host = realhost
1664
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001665 if not host: raise IOError('http error', 'no host given')
1666
1667 if proxy_passwd:
Senthil Kumaranc5c5a142012-01-14 19:09:04 +08001668 proxy_passwd = unquote(proxy_passwd)
Senthil Kumaran5626eec2010-08-04 17:46:23 +00001669 proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001670 else:
1671 proxy_auth = None
1672
1673 if user_passwd:
Senthil Kumaranc5c5a142012-01-14 19:09:04 +08001674 user_passwd = unquote(user_passwd)
Senthil Kumaran5626eec2010-08-04 17:46:23 +00001675 auth = base64.b64encode(user_passwd.encode()).decode('ascii')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001676 else:
1677 auth = None
1678 http_conn = connection_factory(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001679 headers = {}
1680 if proxy_auth:
1681 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
1682 if auth:
1683 headers["Authorization"] = "Basic %s" % auth
1684 if realhost:
1685 headers["Host"] = realhost
Senthil Kumarand91ffca2011-03-19 17:25:27 +08001686
1687 # Add Connection:close as we don't support persistent connections yet.
1688 # This helps in closing the socket and avoiding ResourceWarning
1689
1690 headers["Connection"] = "close"
1691
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001692 for header, value in self.addheaders:
1693 headers[header] = value
1694
1695 if data is not None:
1696 headers["Content-Type"] = "application/x-www-form-urlencoded"
1697 http_conn.request("POST", selector, data, headers)
1698 else:
1699 http_conn.request("GET", selector, headers=headers)
1700
1701 try:
1702 response = http_conn.getresponse()
1703 except http.client.BadStatusLine:
1704 # something went wrong with the HTTP status line
Georg Brandl13e89462008-07-01 19:56:00 +00001705 raise URLError("http protocol error: bad status line")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001706
1707 # According to RFC 2616, "2xx" code indicates that the client's
1708 # request was successfully received, understood, and accepted.
1709 if 200 <= response.status < 300:
Antoine Pitroub353c122009-02-11 00:39:14 +00001710 return addinfourl(response, response.msg, "http:" + url,
Georg Brandl13e89462008-07-01 19:56:00 +00001711 response.status)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001712 else:
1713 return self.http_error(
1714 url, response.fp,
1715 response.status, response.reason, response.msg, data)
1716
1717 def open_http(self, url, data=None):
1718 """Use HTTP protocol."""
1719 return self._open_generic_http(http.client.HTTPConnection, url, data)
1720
1721 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
1722 """Handle http errors.
1723
1724 Derived class can override this, or provide specific handlers
1725 named http_error_DDD where DDD is the 3-digit error code."""
1726 # First check if there's a specific handler for this error
1727 name = 'http_error_%d' % errcode
1728 if hasattr(self, name):
1729 method = getattr(self, name)
1730 if data is None:
1731 result = method(url, fp, errcode, errmsg, headers)
1732 else:
1733 result = method(url, fp, errcode, errmsg, headers, data)
1734 if result: return result
1735 return self.http_error_default(url, fp, errcode, errmsg, headers)
1736
1737 def http_error_default(self, url, fp, errcode, errmsg, headers):
1738 """Default error handler: close the connection and raise IOError."""
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001739 fp.close()
Georg Brandl13e89462008-07-01 19:56:00 +00001740 raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001741
1742 if _have_ssl:
1743 def _https_connection(self, host):
1744 return http.client.HTTPSConnection(host,
1745 key_file=self.key_file,
1746 cert_file=self.cert_file)
1747
1748 def open_https(self, url, data=None):
1749 """Use HTTPS protocol."""
1750 return self._open_generic_http(self._https_connection, url, data)
1751
1752 def open_file(self, url):
1753 """Use local file or FTP depending on form of URL."""
1754 if not isinstance(url, str):
Senthil Kumarancad7b312012-10-27 02:26:46 -07001755 raise URLError('file error: proxy support for file protocol currently not implemented')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001756 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
Senthil Kumaran383c32d2010-10-14 11:57:35 +00001757 raise ValueError("file:// scheme is supported only on localhost")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001758 else:
1759 return self.open_local_file(url)
1760
1761 def open_local_file(self, url):
1762 """Use local file."""
1763 import mimetypes, email.utils
1764 from io import StringIO
Georg Brandl13e89462008-07-01 19:56:00 +00001765 host, file = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001766 localname = url2pathname(file)
1767 try:
1768 stats = os.stat(localname)
1769 except OSError as e:
Senthil Kumarancad7b312012-10-27 02:26:46 -07001770 raise URLError(e.strerror, e.filename)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001771 size = stats.st_size
1772 modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
1773 mtype = mimetypes.guess_type(url)[0]
1774 headers = email.message_from_string(
1775 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
1776 (mtype or 'text/plain', size, modified))
1777 if not host:
1778 urlfile = file
1779 if file[:1] == '/':
1780 urlfile = 'file://' + file
Georg Brandl13e89462008-07-01 19:56:00 +00001781 return addinfourl(open(localname, 'rb'), headers, urlfile)
1782 host, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001783 if (not port
Senthil Kumarancad7b312012-10-27 02:26:46 -07001784 and socket.gethostbyname(host) in ((localhost(),) + thishost())):
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001785 urlfile = file
1786 if file[:1] == '/':
1787 urlfile = 'file://' + file
Senthil Kumaran3800ea92012-01-21 11:52:48 +08001788 elif file[:2] == './':
1789 raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
Georg Brandl13e89462008-07-01 19:56:00 +00001790 return addinfourl(open(localname, 'rb'), headers, urlfile)
Senthil Kumarancad7b312012-10-27 02:26:46 -07001791 raise URLError('local file error: not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001792
1793 def open_ftp(self, url):
1794 """Use FTP protocol."""
1795 if not isinstance(url, str):
Senthil Kumarancad7b312012-10-27 02:26:46 -07001796 raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001797 import mimetypes
Georg Brandl13e89462008-07-01 19:56:00 +00001798 host, path = splithost(url)
Senthil Kumarancad7b312012-10-27 02:26:46 -07001799 if not host: raise URLError('ftp error: no host given')
Georg Brandl13e89462008-07-01 19:56:00 +00001800 host, port = splitport(host)
1801 user, host = splituser(host)
1802 if user: user, passwd = splitpasswd(user)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001803 else: passwd = None
Georg Brandl13e89462008-07-01 19:56:00 +00001804 host = unquote(host)
1805 user = unquote(user or '')
1806 passwd = unquote(passwd or '')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001807 host = socket.gethostbyname(host)
1808 if not port:
1809 import ftplib
1810 port = ftplib.FTP_PORT
1811 else:
1812 port = int(port)
Georg Brandl13e89462008-07-01 19:56:00 +00001813 path, attrs = splitattr(path)
1814 path = unquote(path)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001815 dirs = path.split('/')
1816 dirs, file = dirs[:-1], dirs[-1]
1817 if dirs and not dirs[0]: dirs = dirs[1:]
1818 if dirs and not dirs[0]: dirs[0] = '/'
1819 key = user, host, port, '/'.join(dirs)
1820 # XXX thread unsafe!
1821 if len(self.ftpcache) > MAXFTPCACHE:
1822 # Prune the cache, rather arbitrarily
1823 for k in self.ftpcache.keys():
1824 if k != key:
1825 v = self.ftpcache[k]
1826 del self.ftpcache[k]
1827 v.close()
1828 try:
Senthil Kumaran34d38dc2011-10-20 02:48:01 +08001829 if key not in self.ftpcache:
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001830 self.ftpcache[key] = \
1831 ftpwrapper(user, passwd, host, port, dirs)
1832 if not file: type = 'D'
1833 else: type = 'I'
1834 for attr in attrs:
Georg Brandl13e89462008-07-01 19:56:00 +00001835 attr, value = splitvalue(attr)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001836 if attr.lower() == 'type' and \
1837 value in ('a', 'A', 'i', 'I', 'd', 'D'):
1838 type = value.upper()
1839 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
1840 mtype = mimetypes.guess_type("ftp:" + url)[0]
1841 headers = ""
1842 if mtype:
1843 headers += "Content-Type: %s\n" % mtype
1844 if retrlen is not None and retrlen >= 0:
1845 headers += "Content-Length: %d\n" % retrlen
1846 headers = email.message_from_string(headers)
Georg Brandl13e89462008-07-01 19:56:00 +00001847 return addinfourl(fp, headers, "ftp:" + url)
Senthil Kumarancad7b312012-10-27 02:26:46 -07001848 except ftperrors() as exp:
1849 raise URLError('ftp error %r' % exp).with_traceback(sys.exc_info()[2])
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001850
1851 def open_data(self, url, data=None):
1852 """Use "data" URL."""
1853 if not isinstance(url, str):
Senthil Kumarancad7b312012-10-27 02:26:46 -07001854 raise URLError('data error: proxy support for data protocol currently not implemented')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001855 # ignore POSTed data
1856 #
1857 # syntax of data URLs:
1858 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
1859 # mediatype := [ type "/" subtype ] *( ";" parameter )
1860 # data := *urlchar
1861 # parameter := attribute "=" value
1862 try:
1863 [type, data] = url.split(',', 1)
1864 except ValueError:
1865 raise IOError('data error', 'bad data URL')
1866 if not type:
1867 type = 'text/plain;charset=US-ASCII'
1868 semi = type.rfind(';')
1869 if semi >= 0 and '=' not in type[semi:]:
1870 encoding = type[semi+1:]
1871 type = type[:semi]
1872 else:
1873 encoding = ''
1874 msg = []
Senthil Kumaranf6c456d2010-05-01 08:29:18 +00001875 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001876 time.gmtime(time.time())))
1877 msg.append('Content-type: %s' % type)
1878 if encoding == 'base64':
Georg Brandl706824f2009-06-04 09:42:55 +00001879 # XXX is this encoding/decoding ok?
1880 data = base64.decodebytes(data.encode('ascii')).decode('latin1')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001881 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001882 data = unquote(data)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001883 msg.append('Content-Length: %d' % len(data))
1884 msg.append('')
1885 msg.append(data)
1886 msg = '\n'.join(msg)
Georg Brandl13e89462008-07-01 19:56:00 +00001887 headers = email.message_from_string(msg)
1888 f = io.StringIO(msg)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001889 #f.fileno = None # needed for addinfourl
Georg Brandl13e89462008-07-01 19:56:00 +00001890 return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001891
1892
class FancyURLopener(URLopener):
    """URLopener that follows redirects and can retry with Basic auth."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}
        # Redirect counter and its limit (see http_error_302).
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- return the response, don't raise."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        Follows the redirect unless the number of consecutive redirects
        has reached self.maxtries, in which case a synthetic 500 error is
        reported instead.
        """
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            meth = getattr(self, "http_error_500", self.http_error_default)
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Open the redirect target named by the Location or URI header.

        Returns None when neither header is present.  Raises HTTPError
        when the target scheme is not http, https, ftp or empty.
        """
        for field in ('location', 'uri'):
            if field in headers:
                newurl = headers[field]
                break
        else:
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3
        if urlparse(newurl).scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is not None:
            return self.http_error_default(url, fp, errcode, errmsg, headers)
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def _basic_auth_challenge(self, challenge_header, retry_prefix, url, fp,
                              errcode, errmsg, headers, data, retry):
        """Shared body of http_error_401 and http_error_407.

        Delegates to URLopener.http_error_default unless the challenge in
        *challenge_header* is a Basic one and *retry* is true; otherwise
        calls the '<retry_prefix><scheme>_basic_auth' method.
        """
        def refuse():
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)

        if challenge_header not in headers:
            refuse()
        challenge = headers[challenge_header]
        import re
        found = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', challenge)
        if not found:
            refuse()
        scheme, realm = found.groups()
        if scheme.lower() != 'basic':
            refuse()
        if not retry:
            refuse()
        retry_method = getattr(self, retry_prefix + self.type + '_basic_auth')
        if data is None:
            return retry_method(url, realm)
        return retry_method(url, realm, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        return self._basic_auth_challenge(
            'www-authenticate', 'retry_', url, fp, errcode, errmsg,
            headers, data, retry)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        return self._basic_auth_challenge(
            'proxy-authenticate', 'retry_proxy_', url, fp, errcode, errmsg,
            headers, data, retry)

    def _retry_via_proxy(self, scheme, url, realm, data):
        """Store credentials in the *scheme* proxy URL and reopen *url*.

        Returns None when no user name or password was obtained.
        """
        host, selector = splithost(url)
        newurl = scheme + '://' + host + selector
        urltype, proxyhost = splittype(self.proxies[scheme])
        proxyhost, proxyselector = splithost(proxyhost)
        # Drop any credentials already embedded in the proxy host; a
        # non-zero offset also tells get_user_passwd to clear its cache.
        at = proxyhost.find('@') + 1
        proxyhost = proxyhost[at:]
        user, passwd = self.get_user_passwd(proxyhost, realm, at)
        if not (user or passwd):
            return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies[scheme] = scheme + '://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        return self.open(newurl, data)

    def _retry_direct(self, scheme, url, realm, data):
        """Embed credentials in the host part of *url* and reopen it.

        Returns None when no user name or password was obtained.
        """
        host, selector = splithost(url)
        # Drop any credentials already embedded in the host; a non-zero
        # offset also tells get_user_passwd to clear its cache.
        at = host.find('@') + 1
        host = host[at:]
        user, passwd = self.get_user_passwd(host, realm, at)
        if not (user or passwd):
            return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = scheme + '://' + host + selector
        if data is None:
            return self.open(newurl)
        return self.open(newurl, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        return self._retry_via_proxy('http', url, realm, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        return self._retry_via_proxy('https', url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        return self._retry_direct('http', url, realm, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        return self._retry_direct('https', url, realm, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for *realm* at *host*.

        A cached pair is returned unless *clear_cache* is true, in which
        case it is dropped and the user is prompted again.  A pair is
        cached only when at least one of its members is non-empty.
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if not clear_cache:
                return self.auth_cache[key]
            del self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd:
            self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
        except KeyboardInterrupt:
            print()
            return None, None
        else:
            return user, passwd
2103
2104
2105# Utility functions
2106
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup is done once and the result is cached in _localhost.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2114
_thishost = None
def thishost():
    """Return the IP addresses of the current host as a tuple.

    The lookup is done once and the result is cached in _thishost.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    addresses = socket.gethostbyname_ex(socket.gethostname())[2]
    _thishost = tuple(addresses)
    return _thishost
2122
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported lazily on first use; the result is cached in
    _ftperrors.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2131
_noheaders = None
def noheaders():
    """Return an empty email Message object.

    The same object, cached in _noheaders, is returned on every call.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2139
2140
2141# Utility classes
2142
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections.

    The wrapper counts the file objects it has handed out (refcount) so
    that the control connection is really closed only once close() has
    been requested and every such file object has been closed.
    """

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        """Remember the connection parameters and connect immediately.

        *dirs* is the list of directories to change into after login.
        *persistent* is the initial value of the keepalive flag; while it
        is true, closing the last handed-out file does not close the
        connection.
        """
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.refcount = 0
        self.keepalive = persistent
        self.init()

    def init(self):
        """(Re)connect, log in and change into the configured directory."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        """Start a transfer and return (file object, length).

        *type* is the FTP transfer type: 'd'/'D' requests a directory
        listing, anything else is sent as 'TYPE <type>' and *file* is
        retrieved.  When retrieval is refused with a 550 reply, a
        directory listing of *file* is attempted instead.

        The length is whatever ftplib's ntransfercmd() reports.

        Raises URLError for FTP permission errors other than 550 on
        retrieval, and when *file* cannot be entered as a directory.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'):
            cmd = 'TYPE A'
            isdir = 1
        else:
            cmd = 'TYPE ' + type
            isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The connection may have gone stale: reconnect and retry once.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                if str(reason)[:3] != '550':
                    # Use %r: *reason* is an exception instance, and
                    # formatting it with %d would raise TypeError instead
                    # of the intended URLError.
                    raise URLError('ftp error: %r' % reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error: %r' % reason) from reason
                finally:
                    # Always return to the directory we started from.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        # file_close() runs when the caller closes the returned object.
        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        """Mark the wrapper as no longer busy with a transfer."""
        self.busy = 0

    def close(self):
        """Request closing; the connection is closed right away only when
        no handed-out file object is still open."""
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        """Close hook of a handed-out file object: drop one reference and
        close the connection if close() was already requested."""
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        """Close the FTP connection, ignoring FTP errors while doing so."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2232
2233# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Every non-empty environment variable whose lower-cased name ends in
    '_proxy' contributes one entry, keyed by the lower-cased name without
    that suffix.  If you need a different way, you can pass a proxies
    dictionary to the [Fancy]URLopener constructor.

    """
    return {
        name.lower()[:-6]: value
        for name, value in os.environ.items()
        if value and name.lower().endswith('_proxy')
    }
2249
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Reads the no_proxy (or NO_PROXY) environment variable, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    Returns 1 when *host*, with or without its port, ends with one of the
    suffixes, and 0 otherwise.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # check if the host ends with any of the DNS suffixes
    for entry in no_proxy.split(','):
        suffix = entry.strip()
        if not suffix:
            continue
        if hostonly.endswith(suffix) or host.endswith(suffix):
            return 1
    # otherwise, don't bypass
    return 0
2269
2270
Ronald Oussorene72e1612011-03-14 18:15:25 -04002271# This code tests an OSX specific data structure but is testable on all
2272# platforms
2273def _proxy_bypass_macosx_sysconf(host, proxy_settings):
2274 """
2275 Return True iff this host shouldn't be accessed using a proxy
2276
2277 This function uses the MacOSX framework SystemConfiguration
2278 to fetch the proxy information.
2279
2280 proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
2281 { 'exclude_simple': bool,
2282 'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
2283 }
2284 """
2285 import re
2286 import socket
2287 from fnmatch import fnmatch
2288
2289 hostonly, port = splitport(host)
2290
2291 def ip2num(ipAddr):
2292 parts = ipAddr.split('.')
2293 parts = list(map(int, parts))
2294 if len(parts) != 4:
2295 parts = (parts + [0, 0, 0, 0])[:4]
2296 return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
2297
2298 # Check for simple host names:
2299 if '.' not in host:
2300 if proxy_settings['exclude_simple']:
2301 return True
2302
2303 hostIP = None
2304
2305 for value in proxy_settings.get('exceptions', ()):
2306 # Items in the list are strings like these: *.local, 169.254/16
2307 if not value: continue
2308
2309 m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
2310 if m is not None:
2311 if hostIP is None:
2312 try:
2313 hostIP = socket.gethostbyname(hostonly)
2314 hostIP = ip2num(hostIP)
2315 except socket.error:
2316 continue
2317
2318 base = ip2num(m.group(1))
2319 mask = m.group(2)
2320 if mask is None:
2321 mask = 8 * (m.group(1).count('.') + 1)
2322 else:
2323 mask = int(mask[1:])
2324 mask = 32 - mask
2325
2326 if (hostIP >> mask) == (base >> mask):
2327 return True
2328
2329 elif fnmatch(host, value):
2330 return True
2331
2332 return False
2333
2334
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002335if sys.platform == 'darwin':
Ronald Oussoren84151202010-04-18 20:46:11 +00002336 from _scproxy import _get_proxy_settings, _get_proxies
2337
2338 def proxy_bypass_macosx_sysconf(host):
Ronald Oussoren84151202010-04-18 20:46:11 +00002339 proxy_settings = _get_proxy_settings()
Ronald Oussorene72e1612011-03-14 18:15:25 -04002340 return _proxy_bypass_macosx_sysconf(host, proxy_settings)
Ronald Oussoren84151202010-04-18 20:46:11 +00002341
2342 def getproxies_macosx_sysconf():
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002343 """Return a dictionary of scheme -> proxy server URL mappings.
2344
Ronald Oussoren84151202010-04-18 20:46:11 +00002345 This function uses the MacOSX framework SystemConfiguration
2346 to fetch the proxy information.
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002347 """
Ronald Oussoren84151202010-04-18 20:46:11 +00002348 return _get_proxies()
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002349
Ronald Oussoren84151202010-04-18 20:46:11 +00002350
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002351
2352 def proxy_bypass(host):
2353 if getproxies_environment():
2354 return proxy_bypass_environment(host)
2355 else:
Ronald Oussoren84151202010-04-18 20:46:11 +00002356 return proxy_bypass_macosx_sysconf(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002357
2358 def getproxies():
Ronald Oussoren84151202010-04-18 20:46:11 +00002359 return getproxies_environment() or getproxies_macosx_sysconf()
2360
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002361
2362elif os.name == 'nt':
2363 def getproxies_registry():
2364 """Return a dictionary of scheme -> proxy server URL mappings.
2365
2366 Win32 uses the registry to store proxies.
2367
2368 """
2369 proxies = {}
2370 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002371 import winreg
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002372 except ImportError:
2373 # Std module, so should be around - but you never know!
2374 return proxies
2375 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002376 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002377 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002378 proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002379 'ProxyEnable')[0]
2380 if proxyEnable:
2381 # Returned as Unicode but problems if not converted to ASCII
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002382 proxyServer = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002383 'ProxyServer')[0])
2384 if '=' in proxyServer:
2385 # Per-protocol settings
2386 for p in proxyServer.split(';'):
2387 protocol, address = p.split('=', 1)
2388 # See if address has a type:// prefix
2389 import re
2390 if not re.match('^([^/:]+)://', address):
2391 address = '%s://%s' % (protocol, address)
2392 proxies[protocol] = address
2393 else:
2394 # Use one setting for all protocols
2395 if proxyServer[:5] == 'http:':
2396 proxies['http'] = proxyServer
2397 else:
2398 proxies['http'] = 'http://%s' % proxyServer
Senthil Kumaran04f31b82010-07-14 20:10:52 +00002399 proxies['https'] = 'https://%s' % proxyServer
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002400 proxies['ftp'] = 'ftp://%s' % proxyServer
2401 internetSettings.Close()
2402 except (WindowsError, ValueError, TypeError):
2403 # Either registry key not found etc, or the value in an
2404 # unexpected format.
2405 # proxies already set up to be empty so nothing to do
2406 pass
2407 return proxies
2408
2409 def getproxies():
2410 """Return a dictionary of scheme -> proxy server URL mappings.
2411
2412 Returns settings gathered from the environment, if specified,
2413 or the registry.
2414
2415 """
2416 return getproxies_environment() or getproxies_registry()
2417
2418 def proxy_bypass_registry(host):
2419 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002420 import winreg
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002421 import re
2422 except ImportError:
2423 # Std modules, so should be around - but you never know!
2424 return 0
2425 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002426 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002427 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002428 proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002429 'ProxyEnable')[0]
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002430 proxyOverride = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002431 'ProxyOverride')[0])
2432 # ^^^^ Returned as Unicode but problems if not converted to ASCII
2433 except WindowsError:
2434 return 0
2435 if not proxyEnable or not proxyOverride:
2436 return 0
2437 # try to make a host list from name and IP address.
Georg Brandl13e89462008-07-01 19:56:00 +00002438 rawHost, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002439 host = [rawHost]
2440 try:
2441 addr = socket.gethostbyname(rawHost)
2442 if addr != rawHost:
2443 host.append(addr)
2444 except socket.error:
2445 pass
2446 try:
2447 fqdn = socket.getfqdn(rawHost)
2448 if fqdn != rawHost:
2449 host.append(fqdn)
2450 except socket.error:
2451 pass
2452 # make a check value list from the registry entry: replace the
2453 # '<local>' string by the localhost entry and the corresponding
2454 # canonical entry.
2455 proxyOverride = proxyOverride.split(';')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002456 # now check if we match one of the registry values.
2457 for test in proxyOverride:
Senthil Kumaran49476062009-05-01 06:00:23 +00002458 if test == '<local>':
2459 if '.' not in rawHost:
2460 return 1
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002461 test = test.replace(".", r"\.") # mask dots
2462 test = test.replace("*", r".*") # change glob sequence
2463 test = test.replace("?", r".") # change glob char
2464 for val in host:
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002465 if re.match(test, val, re.I):
2466 return 1
2467 return 0
2468
2469 def proxy_bypass(host):
2470 """Return a dictionary of scheme -> proxy server URL mappings.
2471
2472 Returns settings gathered from the environment, if specified,
2473 or the registry.
2474
2475 """
2476 if getproxies_environment():
2477 return proxy_bypass_environment(host)
2478 else:
2479 return proxy_bypass_registry(host)
2480
2481else:
2482 # By default use environment variables
2483 getproxies = getproxies_environment
2484 proxy_bypass = proxy_bypass_environment