"""An extensible library for opening URLs using a variety of protocols

The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the result as a file-like
object; the returned object has some extra methods described below.

The OpenerDirector manages a collection of Handler objects that do
all the actual work.  Each Handler implements a particular protocol or
option.  The OpenerDirector is a composite object that invokes the
Handlers needed to open the requested URL.  For example, the
HTTPHandler performs HTTP GET and POST requests and deals with
non-error returns.  The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
deals with digest authentication.

urlopen(url, data=None) -- Basic usage is the same as original
urllib.  Pass the url and optionally data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of a URL.  Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.

build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of a default
handler, that argument will be installed instead of the default.

install_opener -- Installs a new opener as the default opener.

objects of interest:

OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
the Handler classes, while dealing with requests and responses.

Request -- An object that encapsulates the state of a request.  The
state can be as simple as the URL.  It can also include extra HTTP
headers, e.g. a User-Agent.

BaseHandler --

internals:
BaseHandler and parent
_call_chain conventions

Example usage:

import urllib.request

# set up authentication info
authinfo = urllib.request.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
                      uri='https://mahler:8092/site-updates.py',
                      user='klem',
                      passwd='geheim$parole')

proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib.request.build_opener(proxy_support, authinfo,
                                     urllib.request.CacheFTPHandler)

# install it
urllib.request.install_opener(opener)

f = urllib.request.urlopen('http://www.python.org/')
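
# URLError is raised on failure; an HTTPError can be caught as an
# exception or, since it is also a valid response, read directly
# (a minimal sketch; the URL is illustrative):
import urllib.error
try:
    f = urllib.request.urlopen('http://www.python.org/does-not-exist')
except urllib.error.HTTPError as e:
    print(e.code)      # numeric status code, e.g. 404
    print(e.read())    # body of the error response
except urllib.error.URLError as e:
    print(e.reason)    # network-level failure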
"""

# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails, how should the error be signalled?  The client needs to
# know the HTTP error code.  But if the handler knows what the problem
# was, e.g., that it didn't know the hash algorithm requested in the
# challenge, it would be good to pass that information along to the
# client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation

# Possible extensions:
# complex proxies  XXX not sure what exactly was meant by this
# abstract factory for opener

import base64
import bisect
import email
import hashlib
import http.client
import io
import os
import posixpath
import random
import re
import socket
import sys
import time

from urllib.error import URLError, HTTPError, ContentTooShortError
from urllib.parse import (
    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
    splittype, splithost, splitport, splituser, splitpasswd,
    splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
from urllib.response import addinfourl, addclosehook

# check for SSL
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True

# used in User-Agent header sent
__version__ = sys.version[:3]

_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    global _opener
    if _opener is None:
        _opener = build_opener()
    return _opener.open(url, data, timeout)

def install_opener(opener):
    global _opener
    _opener = opener

# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)

def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()
    global _opener
    if _opener:
        _opener = None

# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    url = request.full_url
    host = urlparse(url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    host = _cut_port_re.sub("", host, 1)
    return host.lower()

class Request:

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        self.full_url, fragment = splittag(self.full_url)
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        for key, value in headers.items():
            self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self._parse()

    def _parse(self):
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        if self.data is not None:
            return "POST"
        else:
            return "GET"

    # Begin deprecated methods

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.full_url

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())

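# A small usage sketch for the Request class above (the URL and header
# values are illustrative only):
#
#   req = Request('http://www.example.com/',
#                 headers={'User-Agent': 'example-client/1.0'})
#   req.add_unredirected_header('Accept', 'text/html')
#   req.get_method()    # 'GET' here; 'POST' once req.data is set
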
class OpenerDirector:
    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

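    # add_handler (above) files each handler method by the naming
    # convention protocol_condition; a brief illustrative sketch of
    # where a few method names end up:
    #   http_open       -> handle_open['http']
    #   http_error_302  -> handle_error['http'][302]
    #   http_request    -> process_request['http']
    #   http_response   -> process_response['http']
    #   unknown_open    -> handle_open['unknown']
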
    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http']  # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)

# XXX probably also want an abstract factory that knows when it makes
# sense to skip a superclass in favor of a subclass and when it might
# make sense to include both

def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and, when applicable, HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)
    skip = set()
    for klass in default_classes:
        for check in handlers:
            if isclass(check):
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                skip.add(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if isclass(h):
            h = h()
        opener.add_handler(h)
    return opener

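# A brief sketch of the subclass rule described above: passing a
# subclass (or an instance of a subclass) of a default handler
# suppresses that default.  The handler name below is illustrative.
#
#   class VerboseHTTPHandler(HTTPHandler):
#       def http_open(self, req):
#           print("fetching", req.full_url)
#           return super().http_open(req)
#
#   opener = build_opener(VerboseHTTPHandler)  # default HTTPHandler skipped
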
class BaseHandler:
    handler_order = 500

    def add_parent(self, parent):
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        if not hasattr(other, "handler_order"):
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other.handler_order


class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if not (200 <= code < 300):
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response

class HTTPDefaultErrorHandler(BaseHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.full_url, code, msg, hdrs, fp)

class HTTPRedirectHandler(BaseHandler):
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be lenient with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)
        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
            newurl = urlunparse(urlparts)

        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"


def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, r_scheme = splittype(proxy)
    if not r_scheme.startswith("/"):
        # authority
        scheme = None
        authority = proxy
    else:
        # URL
        if not r_scheme.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
        # and 3.3.), path is empty or starts with '/'
        end = r_scheme.find("/", 2)
        if end == -1:
            end = None
        authority = r_scheme[2:end]
    userinfo, hostport = splituser(authority)
    if userinfo is not None:
        user, password = splitpasswd(userinfo)
    else:
        user = password = None
    return scheme, user, password, hostport

class ProxyHandler(BaseHandler):
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)

class HTTPPasswordMgr:

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False


class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):

    def find_user_password(self, realm, authuri):
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is not None:
            return user, password
        return HTTPPasswordMgr.find_user_password(self, None, authuri)


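# A brief sketch of the default-realm fallback above: credentials added
# with realm=None are used when no realm-specific entry matches (the
# URL and credentials below are illustrative only).
#
#   pwd_mgr = HTTPPasswordMgrWithDefaultRealm()
#   pwd_mgr.add_password(None, 'http://www.example.com/', 'klem', 'secret')
#   opener = build_opener(HTTPBasicAuthHandler(pwd_mgr))
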
class AbstractBasicAuthHandler:

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password
        self.retried = 0

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if self.retried > 5:
            # retry sending the username:password 5 times before failing.
            raise HTTPError(req.get_full_url(), 401, "basic auth failed",
                            headers, None)
        else:
            self.retried += 1

        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, quote, realm = mo.groups()
                if scheme.lower() == 'basic':
                    response = self.retry_http_basic_auth(host, req, realm)
                    if response and response.code != 401:
                        self.retried = 0
                    return response

    def retry_http_basic_auth(self, host, req, realm):
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.headers.get(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None


class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        url = req.full_url
        response = self.http_error_auth_reqed('www-authenticate',
                                              url, req, headers)
        self.reset_retry_count()
        return response


class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        authority = req.host
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              authority, req, headers)
        self.reset_retry_count()
        return response


def randombytes(n):
    """Return n random bytes."""
    return os.urandom(n)

class AbstractDigestAuthHandler:
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" support is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop == 'auth':
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None


class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        host = urlparse(req.full_url)[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry


class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        host = req.host
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry

class AbstractHTTPHandler(BaseHandler):

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        sel_host = host
        if request.has_proxy():
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        h = http_class(host, timeout=req.timeout)  # will parse host:port

        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h._set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            h.request(req.get_method(), req.selector, req.data, headers)
            r = h.getresponse()  # an HTTPResponse instance
        except socket.error as err:
            raise URLError(err)

        r.url = req.full_url
        # This line replaces the .msg attribute of the HTTPResponse
        # (which http.client sets to the headers) with the reason
        # string, because urllib clients expect the reason in .msg.
        # It would be good to mark this attribute as deprecated and
        # direct clients to info() or .headers instead.
        r.msg = r.reason
        return r


class HTTPHandler(AbstractHTTPHandler):

    def http_open(self, req):
        return self.do_open(http.client.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_

if hasattr(http.client, 'HTTPSConnection'):
    class HTTPSHandler(AbstractHTTPHandler):

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req)

        https_request = AbstractHTTPHandler.do_request_

class HTTPCookieProcessor(BaseHandler):
    def __init__(self, cookiejar=None):
        import http.cookiejar
        if cookiejar is None:
            cookiejar = http.cookiejar.CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response

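# A minimal sketch of cookie handling with the processor above: the same
# CookieJar is consulted on every request and updated from every response
# made through the opener (the URL below is illustrative).
#
#   import http.cookiejar
#   jar = http.cookiejar.CookieJar()
#   opener = build_opener(HTTPCookieProcessor(jar))
#   opener.open('http://www.example.com/')   # cookies are now stored in jar
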
class UnknownHandler(BaseHandler):
    def unknown_open(self, req):
        type = req.type
        raise URLError('unknown url type: %s' % type)

def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated."""
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        if v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed

def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    res = []
    part = ''

    escape = quote = False
    for cur in s:
        if escape:
            part += cur
            escape = False
            continue
        if quote:
            if cur == '\\':
                escape = True
                continue
            elif cur == '"':
                quote = False
            part += cur
            continue

        if cur == ',':
            res.append(part)
            part = ''
            continue

        if cur == '"':
            quote = True

        part += cur

    # append last part
    if part:
        res.append(part)

    return [part.strip() for part in res]

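# For example (an illustrative sketch of the two parsing helpers above):
#
#   parse_http_list('text/html, text/plain;q=0.5')
#       -> ['text/html', 'text/plain;q=0.5']
#   parse_http_list('realm="a, b", nonce="xyz"')
#       -> ['realm="a, b"', 'nonce="xyz"']
#   parse_keqv_list(['realm="a, b"', 'nonce="xyz"'])
#       -> {'realm': 'a, b', 'nonce': 'xyz'}
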
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')

def _safe_gethostbyname(host):
    try:
        return socket.gethostbyname(host)
    except socket.gaierror:
        return None

class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)
        path, attrs = splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
        return fw

class CacheFTPHandler(FTPHandler):
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}
        self.timeout = {}
        self.soonest = 0
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            self.soonest = min(list(self.timeout.values()))

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(list(self.timeout.values()))

# Code moved from the old urllib module

1364MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
1365
1366# Helper for non-unix systems
1367if os.name == 'mac':
1368 from macurl2path import url2pathname, pathname2url
1369elif os.name == 'nt':
1370 from nturl2path import url2pathname, pathname2url
1371else:
1372 def url2pathname(pathname):
1373 """OS-specific conversion from a relative URL of the 'file' scheme
1374 to a file system path; not recommended for general use."""
Georg Brandl13e89462008-07-01 19:56:00 +00001375 return unquote(pathname)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001376
1377 def pathname2url(pathname):
1378 """OS-specific conversion from a file system path to a relative URL
1379 of the 'file' scheme; not recommended for general use."""
Georg Brandl13e89462008-07-01 19:56:00 +00001380 return quote(pathname)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001381
1382# This really consists of two pieces:
1383# (1) a class which handles opening of all sorts of URLs
1384# (plus assorted utilities etc.)
1385# (2) a set of functions for parsing URLs
1386# XXX Should these be separated out into different modules?
1387
1388
1389ftpcache = {}
1390class URLopener:
1391 """Class to open URLs.
1392 This is a class rather than just a subroutine because we may need
1393 more than one set of global protocol-specific options.
1394 Note -- this is a base class for those who don't want the
1395 automatic handling of errors type 302 (relocated) and 401
1396 (authorization needed)."""
1397
1398 __tempfiles = None
1399
1400 version = "Python-urllib/%s" % __version__
1401
1402 # Constructor
1403 def __init__(self, proxies=None, **x509):
1404 if proxies is None:
1405 proxies = getproxies()
1406 assert hasattr(proxies, 'keys'), "proxies must be a mapping"
1407 self.proxies = proxies
1408 self.key_file = x509.get('key_file')
1409 self.cert_file = x509.get('cert_file')
1410 self.addheaders = [('User-Agent', self.version)]
1411 self.__tempfiles = []
1412 self.__unlink = os.unlink # See cleanup()
1413 self.tempcache = None
1414 # Undocumented feature: if you assign {} to tempcache,
1415 # it is used to cache files retrieved with
1416 # self.retrieve(). This is not enabled by default
1417 # since it does not work for changing documents (and I
1418 # haven't got the logic to check expiration headers
1419 # yet).
1420 self.ftpcache = ftpcache
1421 # Undocumented feature: you can use a different
1422 # ftp cache by assigning to the .ftpcache member;
1423 # in case you want logically independent URL openers
1424 # XXX This is not threadsafe. Bah.
1425
1426 def __del__(self):
1427 self.close()
1428
1429 def close(self):
1430 self.cleanup()
1431
1432 def cleanup(self):
1433 # This code sometimes runs when the rest of this module
1434 # has already been deleted, so it can't use any globals
1435 # or import anything.
1436 if self.__tempfiles:
1437 for file in self.__tempfiles:
1438 try:
1439 self.__unlink(file)
1440 except OSError:
1441 pass
1442 del self.__tempfiles[:]
1443 if self.tempcache:
1444 self.tempcache.clear()
1445
1446 def addheader(self, *args):
1447 """Add a header to be used by the HTTP interface only
1448 e.g. u.addheader('Accept', 'sound/basic')"""
1449 self.addheaders.append(args)
1450
1451 # External interface
1452 def open(self, fullurl, data=None):
1453 """Use URLopener().open(file) instead of open(file, 'r')."""
Georg Brandl13e89462008-07-01 19:56:00 +00001454 fullurl = unwrap(to_bytes(fullurl))
Senthil Kumaran0e7e9ae2010-02-20 22:30:21 +00001455 fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001456 if self.tempcache and fullurl in self.tempcache:
1457 filename, headers = self.tempcache[fullurl]
1458 fp = open(filename, 'rb')
Georg Brandl13e89462008-07-01 19:56:00 +00001459 return addinfourl(fp, headers, fullurl)
1460 urltype, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001461 if not urltype:
1462 urltype = 'file'
1463 if urltype in self.proxies:
1464 proxy = self.proxies[urltype]
Georg Brandl13e89462008-07-01 19:56:00 +00001465 urltype, proxyhost = splittype(proxy)
1466 host, selector = splithost(proxyhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001467 url = (host, fullurl) # Signal special case to open_*()
1468 else:
1469 proxy = None
1470 name = 'open_' + urltype
1471 self.type = urltype
1472 name = name.replace('-', '_')
1473 if not hasattr(self, name):
1474 if proxy:
1475 return self.open_unknown_proxy(proxy, fullurl, data)
1476 else:
1477 return self.open_unknown(fullurl, data)
1478 try:
1479 if data is None:
1480 return getattr(self, name)(url)
1481 else:
1482 return getattr(self, name)(url, data)
1483 except socket.error as msg:
1484 raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1485
1486 def open_unknown(self, fullurl, data=None):
1487 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001488 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001489 raise IOError('url error', 'unknown url type', type)
1490
1491 def open_unknown_proxy(self, proxy, fullurl, data=None):
1492 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001493 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001494 raise IOError('url error', 'invalid proxy for %s' % type, proxy)
1495
1496 # External interface
1497 def retrieve(self, url, filename=None, reporthook=None, data=None):
1498 """retrieve(url) returns (filename, headers) for a local object
1499 or (tempfilename, headers) for a remote object."""
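        # Call sketch (names are illustrative): reporthook, if given, is
        # called as reporthook(blocknum, blocksize, totalsize), once after
        # the connection is made and once per block read; totalsize is -1
        # when the server sends no Content-Length.
        #
        #   def progress(blocknum, blocksize, totalsize):
        #       print(blocknum * blocksize, "of", totalsize, "bytes")
        #
        #   filename, headers = opener.retrieve('http://www.example.com/',
        #                                       reporthook=progress)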
Georg Brandl13e89462008-07-01 19:56:00 +00001500 url = unwrap(to_bytes(url))
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001501 if self.tempcache and url in self.tempcache:
1502 return self.tempcache[url]
Georg Brandl13e89462008-07-01 19:56:00 +00001503 type, url1 = splittype(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001504 if filename is None and (not type or type == 'file'):
1505 try:
1506 fp = self.open_local_file(url1)
1507 hdrs = fp.info()
1508 del fp
Georg Brandl13e89462008-07-01 19:56:00 +00001509 return url2pathname(splithost(url1)[1]), hdrs
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001510 except IOError as msg:
1511 pass
1512 fp = self.open(url, data)
Benjamin Peterson5f28b7b2009-03-26 21:49:58 +00001513 try:
1514 headers = fp.info()
1515 if filename:
1516 tfp = open(filename, 'wb')
1517 else:
1518 import tempfile
1519 garbage, path = splittype(url)
1520 garbage, path = splithost(path or "")
1521 path, garbage = splitquery(path or "")
1522 path, garbage = splitattr(path or "")
1523 suffix = os.path.splitext(path)[1]
1524 (fd, filename) = tempfile.mkstemp(suffix)
1525 self.__tempfiles.append(filename)
1526 tfp = os.fdopen(fd, 'wb')
1527 try:
1528 result = filename, headers
1529 if self.tempcache is not None:
1530 self.tempcache[url] = result
1531 bs = 1024*8
1532 size = -1
1533 read = 0
1534 blocknum = 0
1535 if reporthook:
1536 if "content-length" in headers:
1537 size = int(headers["Content-Length"])
1538 reporthook(blocknum, bs, size)
1539 while 1:
1540 block = fp.read(bs)
1541 if not block:
1542 break
1543 read += len(block)
1544 tfp.write(block)
1545 blocknum += 1
1546 if reporthook:
1547 reporthook(blocknum, bs, size)
1548 finally:
1549 tfp.close()
1550 finally:
1551 fp.close()
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001552 del fp
1553 del tfp
1554
1555 # raise exception if actual size does not match content-length header
1556 if size >= 0 and read < size:
Georg Brandl13e89462008-07-01 19:56:00 +00001557 raise ContentTooShortError(
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001558 "retrieval incomplete: got only %i out of %i bytes"
1559 % (read, size), result)
1560
1561 return result
1562
1563 # Each method named open_<type> knows how to open that type of URL
1564
1565 def _open_generic_http(self, connection_factory, url, data):
1566        """Make an HTTP connection using connection_factory.
1567
1568 This is an internal method that should be called from
1569 open_http() or open_https().
1570
1571 Arguments:
1572 - connection_factory should take a host name and return an
1573 HTTPConnection instance.
1574        - url is the url to retrieve, or a (host, relative-path) pair.
1575 - data is payload for a POST request or None.
1576 """
1577
1578 user_passwd = None
1579 proxy_passwd= None
1580 if isinstance(url, str):
Georg Brandl13e89462008-07-01 19:56:00 +00001581 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001582 if host:
Georg Brandl13e89462008-07-01 19:56:00 +00001583 user_passwd, host = splituser(host)
1584 host = unquote(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001585 realhost = host
1586 else:
1587 host, selector = url
1588 # check whether the proxy contains authorization information
Georg Brandl13e89462008-07-01 19:56:00 +00001589 proxy_passwd, host = splituser(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001590 # now we proceed with the url we want to obtain
Georg Brandl13e89462008-07-01 19:56:00 +00001591 urltype, rest = splittype(selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001592 url = rest
1593 user_passwd = None
1594 if urltype.lower() != 'http':
1595 realhost = None
1596 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001597 realhost, rest = splithost(rest)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001598 if realhost:
Georg Brandl13e89462008-07-01 19:56:00 +00001599 user_passwd, realhost = splituser(realhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001600 if user_passwd:
1601 selector = "%s://%s%s" % (urltype, realhost, rest)
1602 if proxy_bypass(realhost):
1603 host = realhost
1604
1605        #print("proxy via http:", host, selector)
1606 if not host: raise IOError('http error', 'no host given')
1607
1608 if proxy_passwd:
1609 import base64
Senthil Kumaranfe2f4ec2010-08-04 17:49:13 +00001610 proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001611 else:
1612 proxy_auth = None
1613
1614 if user_passwd:
1615 import base64
Senthil Kumaranfe2f4ec2010-08-04 17:49:13 +00001616 auth = base64.b64encode(user_passwd.encode()).decode('ascii')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001617 else:
1618 auth = None
1619 http_conn = connection_factory(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001620 headers = {}
1621 if proxy_auth:
1622 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
1623 if auth:
1624 headers["Authorization"] = "Basic %s" % auth
1625 if realhost:
1626 headers["Host"] = realhost
1627 for header, value in self.addheaders:
1628 headers[header] = value
1629
1630 if data is not None:
1631 headers["Content-Type"] = "application/x-www-form-urlencoded"
1632 http_conn.request("POST", selector, data, headers)
1633 else:
1634 http_conn.request("GET", selector, headers=headers)
1635
1636 try:
1637 response = http_conn.getresponse()
1638 except http.client.BadStatusLine:
1639 # something went wrong with the HTTP status line
Georg Brandl13e89462008-07-01 19:56:00 +00001640 raise URLError("http protocol error: bad status line")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001641
1642 # According to RFC 2616, "2xx" code indicates that the client's
1643 # request was successfully received, understood, and accepted.
1644 if 200 <= response.status < 300:
Antoine Pitroub353c122009-02-11 00:39:14 +00001645 return addinfourl(response, response.msg, "http:" + url,
Georg Brandl13e89462008-07-01 19:56:00 +00001646 response.status)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001647 else:
1648 return self.http_error(
1649 url, response.fp,
1650 response.status, response.reason, response.msg, data)
1651
1652 def open_http(self, url, data=None):
1653 """Use HTTP protocol."""
1654 return self._open_generic_http(http.client.HTTPConnection, url, data)
1655
1656 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
1657 """Handle http errors.
1658
1659 Derived class can override this, or provide specific handlers
1660 named http_error_DDD where DDD is the 3-digit error code."""
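        # For example, a hypothetical subclass could supply a 404 handler
        # that the lookup below will find automatically:
        #
        #   class MyOpener(FancyURLopener):
        #       def http_error_404(self, url, fp, errcode, errmsg, headers,
        #                          data=None):
        #           return None    # fall back to http_error_default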
1661 # First check if there's a specific handler for this error
1662 name = 'http_error_%d' % errcode
1663 if hasattr(self, name):
1664 method = getattr(self, name)
1665 if data is None:
1666 result = method(url, fp, errcode, errmsg, headers)
1667 else:
1668 result = method(url, fp, errcode, errmsg, headers, data)
1669 if result: return result
1670 return self.http_error_default(url, fp, errcode, errmsg, headers)
1671
1672 def http_error_default(self, url, fp, errcode, errmsg, headers):
1673 """Default error handler: close the connection and raise IOError."""
1674 void = fp.read()
1675 fp.close()
Georg Brandl13e89462008-07-01 19:56:00 +00001676 raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001677
1678 if _have_ssl:
1679 def _https_connection(self, host):
1680 return http.client.HTTPSConnection(host,
1681 key_file=self.key_file,
1682 cert_file=self.cert_file)
1683
1684 def open_https(self, url, data=None):
1685 """Use HTTPS protocol."""
1686 return self._open_generic_http(self._https_connection, url, data)
1687
1688 def open_file(self, url):
1689 """Use local file or FTP depending on form of URL."""
1690 if not isinstance(url, str):
1691 raise URLError('file error', 'proxy support for file protocol currently not implemented')
1692 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
1693 return self.open_ftp(url)
1694 else:
1695 return self.open_local_file(url)
1696
1697 def open_local_file(self, url):
1698 """Use local file."""
1699 import mimetypes, email.utils
1700 from io import StringIO
Georg Brandl13e89462008-07-01 19:56:00 +00001701 host, file = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001702 localname = url2pathname(file)
1703 try:
1704 stats = os.stat(localname)
1705 except OSError as e:
1706            raise URLError(e.strerror, e.filename)
1707 size = stats.st_size
1708 modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
1709 mtype = mimetypes.guess_type(url)[0]
1710 headers = email.message_from_string(
1711 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
1712 (mtype or 'text/plain', size, modified))
1713 if not host:
1714 urlfile = file
1715 if file[:1] == '/':
1716 urlfile = 'file://' + file
Georg Brandl13e89462008-07-01 19:56:00 +00001717 return addinfourl(open(localname, 'rb'), headers, urlfile)
1718 host, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001719 if (not port
Senthil Kumaran88a495d2009-12-27 10:15:45 +00001720            and socket.gethostbyname(host) in ((localhost(),) + thishost())):
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001721 urlfile = file
1722 if file[:1] == '/':
1723 urlfile = 'file://' + file
Georg Brandl13e89462008-07-01 19:56:00 +00001724 return addinfourl(open(localname, 'rb'), headers, urlfile)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001725 raise URLError('local file error', 'not on local host')
1726
1727 def open_ftp(self, url):
1728 """Use FTP protocol."""
1729 if not isinstance(url, str):
1730 raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
1731 import mimetypes
1732 from io import StringIO
Georg Brandl13e89462008-07-01 19:56:00 +00001733 host, path = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001734 if not host: raise URLError('ftp error', 'no host given')
Georg Brandl13e89462008-07-01 19:56:00 +00001735 host, port = splitport(host)
1736 user, host = splituser(host)
1737 if user: user, passwd = splitpasswd(user)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001738 else: passwd = None
Georg Brandl13e89462008-07-01 19:56:00 +00001739 host = unquote(host)
1740 user = unquote(user or '')
1741 passwd = unquote(passwd or '')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001742 host = socket.gethostbyname(host)
1743 if not port:
1744 import ftplib
1745 port = ftplib.FTP_PORT
1746 else:
1747 port = int(port)
Georg Brandl13e89462008-07-01 19:56:00 +00001748 path, attrs = splitattr(path)
1749 path = unquote(path)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001750 dirs = path.split('/')
1751 dirs, file = dirs[:-1], dirs[-1]
1752 if dirs and not dirs[0]: dirs = dirs[1:]
1753 if dirs and not dirs[0]: dirs[0] = '/'
1754 key = user, host, port, '/'.join(dirs)
1755 # XXX thread unsafe!
1756 if len(self.ftpcache) > MAXFTPCACHE:
1757 # Prune the cache, rather arbitrarily
1758 for k in self.ftpcache.keys():
1759 if k != key:
1760 v = self.ftpcache[k]
1761 del self.ftpcache[k]
1762 v.close()
1763 try:
1764            if key not in self.ftpcache:
1765 self.ftpcache[key] = \
1766 ftpwrapper(user, passwd, host, port, dirs)
1767 if not file: type = 'D'
1768 else: type = 'I'
1769 for attr in attrs:
Georg Brandl13e89462008-07-01 19:56:00 +00001770 attr, value = splitvalue(attr)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001771 if attr.lower() == 'type' and \
1772 value in ('a', 'A', 'i', 'I', 'd', 'D'):
1773 type = value.upper()
1774 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
1775 mtype = mimetypes.guess_type("ftp:" + url)[0]
1776 headers = ""
1777 if mtype:
1778 headers += "Content-Type: %s\n" % mtype
1779 if retrlen is not None and retrlen >= 0:
1780 headers += "Content-Length: %d\n" % retrlen
1781 headers = email.message_from_string(headers)
Georg Brandl13e89462008-07-01 19:56:00 +00001782 return addinfourl(fp, headers, "ftp:" + url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001783 except ftperrors() as msg:
1784 raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1785
1786 def open_data(self, url, data=None):
1787 """Use "data" URL."""
1788 if not isinstance(url, str):
1789 raise URLError('data error', 'proxy support for data protocol currently not implemented')
1790 # ignore POSTed data
1791 #
1792 # syntax of data URLs:
1793 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
1794 # mediatype := [ type "/" subtype ] *( ";" parameter )
1795 # data := *urlchar
1796 # parameter := attribute "=" value
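        #
        # e.g. (illustrative):
        #   data:text/plain;charset=US-ASCII,Hello%2C%20World
        #   data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==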
1797 try:
1798 [type, data] = url.split(',', 1)
1799 except ValueError:
1800 raise IOError('data error', 'bad data URL')
1801 if not type:
1802 type = 'text/plain;charset=US-ASCII'
1803 semi = type.rfind(';')
1804 if semi >= 0 and '=' not in type[semi:]:
1805 encoding = type[semi+1:]
1806 type = type[:semi]
1807 else:
1808 encoding = ''
1809 msg = []
Senthil Kumaran5a3bc652010-05-01 08:32:23 +00001810 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001811 time.gmtime(time.time())))
1812 msg.append('Content-type: %s' % type)
1813 if encoding == 'base64':
1814 import base64
Georg Brandl706824f2009-06-04 09:42:55 +00001815 # XXX is this encoding/decoding ok?
1816 data = base64.decodebytes(data.encode('ascii')).decode('latin1')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001817 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001818 data = unquote(data)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001819 msg.append('Content-Length: %d' % len(data))
1820 msg.append('')
1821 msg.append(data)
1822 msg = '\n'.join(msg)
Georg Brandl13e89462008-07-01 19:56:00 +00001823 headers = email.message_from_string(msg)
1824 f = io.StringIO(msg)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001825 #f.fileno = None # needed for addinfourl
Georg Brandl13e89462008-07-01 19:56:00 +00001826 return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001827
1828
1829class FancyURLopener(URLopener):
1830 """Derived class with handlers for errors we can handle (perhaps)."""
1831
1832 def __init__(self, *args, **kwargs):
1833 URLopener.__init__(self, *args, **kwargs)
1834 self.auth_cache = {}
1835 self.tries = 0
1836 self.maxtries = 10
1837
1838 def http_error_default(self, url, fp, errcode, errmsg, headers):
1839 """Default error handling -- don't raise an exception."""
Georg Brandl13e89462008-07-01 19:56:00 +00001840 return addinfourl(fp, headers, "http:" + url, errcode)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001841
1842 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
1843 """Error 302 -- relocated (temporarily)."""
1844 self.tries += 1
1845 if self.maxtries and self.tries >= self.maxtries:
1846 if hasattr(self, "http_error_500"):
1847 meth = self.http_error_500
1848 else:
1849 meth = self.http_error_default
1850 self.tries = 0
1851 return meth(url, fp, 500,
1852 "Internal Server Error: Redirect Recursion", headers)
1853 result = self.redirect_internal(url, fp, errcode, errmsg, headers,
1854 data)
1855 self.tries = 0
1856 return result
1857
1858 def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
1859 if 'location' in headers:
1860 newurl = headers['location']
1861 elif 'uri' in headers:
1862 newurl = headers['uri']
1863 else:
1864 return
1865 void = fp.read()
1866 fp.close()
1867 # In case the server sent a relative URL, join with original:
Georg Brandl13e89462008-07-01 19:56:00 +00001868 newurl = urljoin(self.type + ":" + url, newurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001869 return self.open(newurl)
1870
1871 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
1872 """Error 301 -- also relocated (permanently)."""
1873 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
1874
1875 def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
1876 """Error 303 -- also relocated (essentially identical to 302)."""
1877 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
1878
1879 def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
1880 """Error 307 -- relocated, but turn POST into error."""
1881 if data is None:
1882 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
1883 else:
1884 return self.http_error_default(url, fp, errcode, errmsg, headers)
1885
Senthil Kumaranb4d1c2c2010-06-18 15:12:48 +00001886 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
1887 retry=False):
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001888 """Error 401 -- authentication required.
1889 This function supports Basic authentication only."""
1890        if 'www-authenticate' not in headers:
1891 URLopener.http_error_default(self, url, fp,
1892 errcode, errmsg, headers)
1893 stuff = headers['www-authenticate']
1894 import re
1895 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
1896 if not match:
1897 URLopener.http_error_default(self, url, fp,
1898 errcode, errmsg, headers)
1899 scheme, realm = match.groups()
1900 if scheme.lower() != 'basic':
1901 URLopener.http_error_default(self, url, fp,
1902 errcode, errmsg, headers)
Senthil Kumaranb4d1c2c2010-06-18 15:12:48 +00001903 if not retry:
1904 URLopener.http_error_default(self, url, fp, errcode, errmsg,
1905 headers)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001906 name = 'retry_' + self.type + '_basic_auth'
1907 if data is None:
1908 return getattr(self,name)(url, realm)
1909 else:
1910 return getattr(self,name)(url, realm, data)
1911
Senthil Kumaranb4d1c2c2010-06-18 15:12:48 +00001912 def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
1913 retry=False):
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001914 """Error 407 -- proxy authentication required.
1915 This function supports Basic authentication only."""
1916        if 'proxy-authenticate' not in headers:
1917 URLopener.http_error_default(self, url, fp,
1918 errcode, errmsg, headers)
1919 stuff = headers['proxy-authenticate']
1920 import re
1921 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
1922 if not match:
1923 URLopener.http_error_default(self, url, fp,
1924 errcode, errmsg, headers)
1925 scheme, realm = match.groups()
1926 if scheme.lower() != 'basic':
1927 URLopener.http_error_default(self, url, fp,
1928 errcode, errmsg, headers)
Senthil Kumaranb4d1c2c2010-06-18 15:12:48 +00001929 if not retry:
1930 URLopener.http_error_default(self, url, fp, errcode, errmsg,
1931 headers)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001932 name = 'retry_proxy_' + self.type + '_basic_auth'
1933 if data is None:
1934 return getattr(self,name)(url, realm)
1935 else:
1936 return getattr(self,name)(url, realm, data)
1937
1938 def retry_proxy_http_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00001939 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001940 newurl = 'http://' + host + selector
1941 proxy = self.proxies['http']
Georg Brandl13e89462008-07-01 19:56:00 +00001942 urltype, proxyhost = splittype(proxy)
1943 proxyhost, proxyselector = splithost(proxyhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001944 i = proxyhost.find('@') + 1
1945 proxyhost = proxyhost[i:]
1946 user, passwd = self.get_user_passwd(proxyhost, realm, i)
1947 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00001948 proxyhost = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001949 quote(passwd, safe=''), proxyhost)
1950 self.proxies['http'] = 'http://' + proxyhost + proxyselector
1951 if data is None:
1952 return self.open(newurl)
1953 else:
1954 return self.open(newurl, data)
1955
1956 def retry_proxy_https_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00001957 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001958 newurl = 'https://' + host + selector
1959 proxy = self.proxies['https']
Georg Brandl13e89462008-07-01 19:56:00 +00001960 urltype, proxyhost = splittype(proxy)
1961 proxyhost, proxyselector = splithost(proxyhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001962 i = proxyhost.find('@') + 1
1963 proxyhost = proxyhost[i:]
1964 user, passwd = self.get_user_passwd(proxyhost, realm, i)
1965 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00001966 proxyhost = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001967 quote(passwd, safe=''), proxyhost)
1968 self.proxies['https'] = 'https://' + proxyhost + proxyselector
1969 if data is None:
1970 return self.open(newurl)
1971 else:
1972 return self.open(newurl, data)
1973
1974 def retry_http_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00001975 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001976 i = host.find('@') + 1
1977 host = host[i:]
1978 user, passwd = self.get_user_passwd(host, realm, i)
1979 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00001980 host = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001981 quote(passwd, safe=''), host)
1982 newurl = 'http://' + host + selector
1983 if data is None:
1984 return self.open(newurl)
1985 else:
1986 return self.open(newurl, data)
1987
1988 def retry_https_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00001989 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001990 i = host.find('@') + 1
1991 host = host[i:]
1992 user, passwd = self.get_user_passwd(host, realm, i)
1993 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00001994 host = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001995 quote(passwd, safe=''), host)
1996 newurl = 'https://' + host + selector
1997 if data is None:
1998 return self.open(newurl)
1999 else:
2000 return self.open(newurl, data)
2001
Florent Xicluna37ddbb82010-08-14 21:06:29 +00002002 def get_user_passwd(self, host, realm, clear_cache=0):
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002003 key = realm + '@' + host.lower()
2004 if key in self.auth_cache:
2005 if clear_cache:
2006 del self.auth_cache[key]
2007 else:
2008 return self.auth_cache[key]
2009 user, passwd = self.prompt_user_passwd(host, realm)
2010 if user or passwd: self.auth_cache[key] = (user, passwd)
2011 return user, passwd
2012
2013 def prompt_user_passwd(self, host, realm):
2014 """Override this in a GUI environment!"""
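        # e.g. a GUI program might subclass FancyURLopener like this
        # (ask_user_dialog is hypothetical, not part of this module):
        #
        #   class GuiOpener(FancyURLopener):
        #       def prompt_user_passwd(self, host, realm):
        #           return ask_user_dialog(host, realm)   # -> (user, passwd)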
2015 import getpass
2016 try:
2017 user = input("Enter username for %s at %s: " % (realm, host))
2018 passwd = getpass.getpass("Enter password for %s in %s at %s: " %
2019 (user, realm, host))
2020 return user, passwd
2021 except KeyboardInterrupt:
2022 print()
2023 return None, None
2024
2025
2026# Utility functions
2027
2028_localhost = None
2029def localhost():
2030 """Return the IP address of the magic hostname 'localhost'."""
2031 global _localhost
2032 if _localhost is None:
2033 _localhost = socket.gethostbyname('localhost')
2034 return _localhost
2035
2036_thishost = None
2037def thishost():
Senthil Kumaran88a495d2009-12-27 10:15:45 +00002038 """Return the IP addresses of the current host."""
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002039 global _thishost
2040 if _thishost is None:
Senthil Kumaran88a495d2009-12-27 10:15:45 +00002041        _thishost = tuple(socket.gethostbyname_ex(socket.gethostname())[2])
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002042 return _thishost
2043
2044_ftperrors = None
2045def ftperrors():
2046 """Return the set of errors raised by the FTP class."""
2047 global _ftperrors
2048 if _ftperrors is None:
2049 import ftplib
2050 _ftperrors = ftplib.all_errors
2051 return _ftperrors
2052
2053_noheaders = None
2054def noheaders():
Georg Brandl13e89462008-07-01 19:56:00 +00002055 """Return an empty email Message object."""
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002056 global _noheaders
2057 if _noheaders is None:
Georg Brandl13e89462008-07-01 19:56:00 +00002058 _noheaders = email.message_from_string("")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002059 return _noheaders
2060
2061
2062# Utility classes
2063
2064class ftpwrapper:
2065 """Class used by open_ftp() for cache of open FTP connections."""
2066
2067 def __init__(self, user, passwd, host, port, dirs, timeout=None):
2068 self.user = user
2069 self.passwd = passwd
2070 self.host = host
2071 self.port = port
2072 self.dirs = dirs
2073 self.timeout = timeout
2074 self.init()
2075
2076 def init(self):
2077 import ftplib
2078 self.busy = 0
2079 self.ftp = ftplib.FTP()
2080 self.ftp.connect(self.host, self.port, self.timeout)
2081 self.ftp.login(self.user, self.passwd)
2082 for dir in self.dirs:
2083 self.ftp.cwd(dir)
2084
2085 def retrfile(self, file, type):
2086 import ftplib
2087 self.endtransfer()
2088 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
2089 else: cmd = 'TYPE ' + type; isdir = 0
2090 try:
2091 self.ftp.voidcmd(cmd)
2092 except ftplib.all_errors:
2093 self.init()
2094 self.ftp.voidcmd(cmd)
2095 conn = None
2096 if file and not isdir:
2097 # Try to retrieve as a file
2098 try:
2099 cmd = 'RETR ' + file
2100 conn = self.ftp.ntransfercmd(cmd)
2101 except ftplib.error_perm as reason:
2102 if str(reason)[:3] != '550':
Georg Brandl13e89462008-07-01 19:56:00 +00002103 raise URLError('ftp error', reason).with_traceback(
2104 sys.exc_info()[2])
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002105 if not conn:
2106 # Set transfer mode to ASCII!
2107 self.ftp.voidcmd('TYPE A')
2108 # Try a directory listing. Verify that directory exists.
2109 if file:
2110 pwd = self.ftp.pwd()
2111 try:
2112 try:
2113 self.ftp.cwd(file)
2114 except ftplib.error_perm as reason:
Georg Brandl13e89462008-07-01 19:56:00 +00002115 raise URLError('ftp error', reason) from reason
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002116 finally:
2117 self.ftp.cwd(pwd)
2118 cmd = 'LIST ' + file
2119 else:
2120 cmd = 'LIST'
2121 conn = self.ftp.ntransfercmd(cmd)
2122 self.busy = 1
2123 # Pass back both a suitably decorated object and a retrieval length
Georg Brandl13e89462008-07-01 19:56:00 +00002124 return (addclosehook(conn[0].makefile('rb'), self.endtransfer), conn[1])
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002125 def endtransfer(self):
2126 if not self.busy:
2127 return
2128 self.busy = 0
2129 try:
2130 self.ftp.voidresp()
2131 except ftperrors():
2132 pass
2133
2134 def close(self):
2135 self.endtransfer()
2136 try:
2137 self.ftp.close()
2138 except ftperrors():
2139 pass
2140
2141# Proxy handling
2142def getproxies_environment():
2143 """Return a dictionary of scheme -> proxy server URL mappings.
2144
2145 Scan the environment for variables named <scheme>_proxy;
2146 this seems to be the standard convention. If you need a
2147 different way, you can pass a proxies dictionary to the
2148 [Fancy]URLopener constructor.
2149
2150 """
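    # e.g. an environment containing (hypothetical value)
    #
    #   http_proxy=http://proxy.example.com:3128
    #
    # yields {'http': 'http://proxy.example.com:3128'}.  The same mapping
    # could be passed explicitly, e.g.
    #
    #   opener = FancyURLopener(proxies={'http': 'http://proxy.example.com:3128'})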
2151 proxies = {}
2152 for name, value in os.environ.items():
2153 name = name.lower()
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002154 if value and name[-6:] == '_proxy':
2155 proxies[name[:-6]] = value
2156 return proxies
2157
2158def proxy_bypass_environment(host):
2159 """Test if proxies should not be used for a particular host.
2160
2161 Checks the environment for a variable named no_proxy, which should
2162 be a list of DNS suffixes separated by commas, or '*' for all hosts.
2163 """
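    # e.g. (hypothetical) no_proxy=localhost,.example.com bypasses the proxy
    # for "localhost" and for any host ending in ".example.com", while
    # no_proxy=* bypasses it for every host.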
2164 no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
2165 # '*' is special case for always bypass
2166 if no_proxy == '*':
2167 return 1
2168 # strip port off host
Georg Brandl13e89462008-07-01 19:56:00 +00002169 hostonly, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002170 # check if the host ends with any of the DNS suffixes
2171 for name in no_proxy.split(','):
2172 if name and (hostonly.endswith(name) or host.endswith(name)):
2173 return 1
2174 # otherwise, don't bypass
2175 return 0
2176
2177
2178if sys.platform == 'darwin':
Ronald Oussoren218cc582010-04-18 20:49:34 +00002179 from _scproxy import _get_proxy_settings, _get_proxies
2180
2181 def proxy_bypass_macosx_sysconf(host):
2182 """
2183 Return True iff this host shouldn't be accessed using a proxy
2184
2185 This function uses the MacOSX framework SystemConfiguration
2186 to fetch the proxy information.
2187 """
2188 import re
2189 import socket
2190 from fnmatch import fnmatch
2191
2192 hostonly, port = splitport(host)
2193
2194 def ip2num(ipAddr):
2195 parts = ipAddr.split('.')
Mark Dickinsonb7d94362010-05-09 12:17:58 +00002196 parts = list(map(int, parts))
Ronald Oussoren218cc582010-04-18 20:49:34 +00002197 if len(parts) != 4:
2198 parts = (parts + [0, 0, 0, 0])[:4]
2199 return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
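        # e.g. ip2num('169.254.0.0') == 0xA9FE0000; short forms such as
        # '169.254' are padded with zero octets before conversion.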
2200
2201 proxy_settings = _get_proxy_settings()
2202
2203 # Check for simple host names:
2204 if '.' not in host:
2205 if proxy_settings['exclude_simple']:
2206 return True
2207
2208 hostIP = None
2209
2210 for value in proxy_settings.get('exceptions', ()):
2211 # Items in the list are strings like these: *.local, 169.254/16
2212 if not value: continue
2213
2214 m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
2215 if m is not None:
2216 if hostIP is None:
2217 try:
2218 hostIP = socket.gethostbyname(hostonly)
2219 hostIP = ip2num(hostIP)
2220 except socket.error:
2221 continue
2222
2223 base = ip2num(m.group(1))
Ronald Oussorenddb62e92010-06-27 14:27:27 +00002224 mask = m.group(2)
2225 if mask is None:
2226                    mask = 32 - 8 * (m.group(1).count('.') + 1)  # implied prefix -> shift count
2227
2228 else:
2229 mask = int(mask[1:])
2230 mask = 32 - mask
Ronald Oussoren218cc582010-04-18 20:49:34 +00002231
2232 if (hostIP >> mask) == (base >> mask):
2233 return True
2234
2235 elif fnmatch(host, value):
2236 return True
2237
2238 return False
2239
2240
2241 def getproxies_macosx_sysconf():
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002242 """Return a dictionary of scheme -> proxy server URL mappings.
2243
Ronald Oussoren218cc582010-04-18 20:49:34 +00002244 This function uses the MacOSX framework SystemConfiguration
2245 to fetch the proxy information.
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002246 """
Ronald Oussoren218cc582010-04-18 20:49:34 +00002247 return _get_proxies()
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002248
Ronald Oussoren218cc582010-04-18 20:49:34 +00002249
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002250
2251 def proxy_bypass(host):
2252 if getproxies_environment():
2253 return proxy_bypass_environment(host)
2254 else:
Ronald Oussoren218cc582010-04-18 20:49:34 +00002255 return proxy_bypass_macosx_sysconf(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002256
2257 def getproxies():
Ronald Oussoren218cc582010-04-18 20:49:34 +00002258 return getproxies_environment() or getproxies_macosx_sysconf()
2259
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002260
2261elif os.name == 'nt':
2262 def getproxies_registry():
2263 """Return a dictionary of scheme -> proxy server URL mappings.
2264
2265 Win32 uses the registry to store proxies.
2266
2267 """
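        # The ProxyServer value parsed below typically takes one of these
        # forms (illustrative):
        #
        #   http=proxy.example.com:3128;ftp=proxy.example.com:3128
        #   proxy.example.com:3128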
2268 proxies = {}
2269 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002270 import winreg
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002271 except ImportError:
2272 # Std module, so should be around - but you never know!
2273 return proxies
2274 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002275 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002276 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002277 proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002278 'ProxyEnable')[0]
2279 if proxyEnable:
2280 # Returned as Unicode but problems if not converted to ASCII
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002281 proxyServer = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002282 'ProxyServer')[0])
2283 if '=' in proxyServer:
2284 # Per-protocol settings
2285 for p in proxyServer.split(';'):
2286 protocol, address = p.split('=', 1)
2287 # See if address has a type:// prefix
2288 import re
2289 if not re.match('^([^/:]+)://', address):
2290 address = '%s://%s' % (protocol, address)
2291 proxies[protocol] = address
2292 else:
2293 # Use one setting for all protocols
2294 if proxyServer[:5] == 'http:':
2295 proxies['http'] = proxyServer
2296 else:
2297 proxies['http'] = 'http://%s' % proxyServer
Senthil Kumaran1ea57a62010-07-14 20:13:28 +00002298 proxies['https'] = 'https://%s' % proxyServer
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002299 proxies['ftp'] = 'ftp://%s' % proxyServer
2300 internetSettings.Close()
2301 except (WindowsError, ValueError, TypeError):
2302            # Either the registry key was not found, or the value is in an
2303 # unexpected format.
2304 # proxies already set up to be empty so nothing to do
2305 pass
2306 return proxies
2307
2308 def getproxies():
2309 """Return a dictionary of scheme -> proxy server URL mappings.
2310
2311 Returns settings gathered from the environment, if specified,
2312 or the registry.
2313
2314 """
2315 return getproxies_environment() or getproxies_registry()
2316
2317 def proxy_bypass_registry(host):
2318 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002319 import winreg
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002320 import re
2321 except ImportError:
2322 # Std modules, so should be around - but you never know!
2323 return 0
2324 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002325 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002326 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002327 proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002328 'ProxyEnable')[0]
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002329 proxyOverride = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002330 'ProxyOverride')[0])
2331 # ^^^^ Returned as Unicode but problems if not converted to ASCII
2332 except WindowsError:
2333 return 0
2334 if not proxyEnable or not proxyOverride:
2335 return 0
2336 # try to make a host list from name and IP address.
Georg Brandl13e89462008-07-01 19:56:00 +00002337 rawHost, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002338 host = [rawHost]
2339 try:
2340 addr = socket.gethostbyname(rawHost)
2341 if addr != rawHost:
2342 host.append(addr)
2343 except socket.error:
2344 pass
2345 try:
2346 fqdn = socket.getfqdn(rawHost)
2347 if fqdn != rawHost:
2348 host.append(fqdn)
2349 except socket.error:
2350 pass
2351 # make a check value list from the registry entry: replace the
2352 # '<local>' string by the localhost entry and the corresponding
2353 # canonical entry.
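        # e.g. (illustrative) ProxyOverride = 'localhost;*.example.com;<local>'
        # bypasses the proxy for localhost, for any host matching
        # *.example.com, and -- via '<local>' -- for any host name without a
        # dot.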
2354 proxyOverride = proxyOverride.split(';')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002355 # now check if we match one of the registry values.
2356 for test in proxyOverride:
Senthil Kumaran49476062009-05-01 06:00:23 +00002357 if test == '<local>':
2358 if '.' not in rawHost:
2359 return 1
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002360 test = test.replace(".", r"\.") # mask dots
2361 test = test.replace("*", r".*") # change glob sequence
2362 test = test.replace("?", r".") # change glob char
2363 for val in host:
2364                # print("%s <--> %s" % (test, val))
2365 if re.match(test, val, re.I):
2366 return 1
2367 return 0
2368
2369 def proxy_bypass(host):
2370 """Return a dictionary of scheme -> proxy server URL mappings.
2371
2372 Returns settings gathered from the environment, if specified,
2373 or the registry.
2374
2375 """
2376 if getproxies_environment():
2377 return proxy_bypass_environment(host)
2378 else:
2379 return proxy_bypass_registry(host)
2380
2381else:
2382 # By default use environment variables
2383 getproxies = getproxies_environment
2384 proxy_bypass = proxy_bypass_environment