# Issues in merging urllib and urllib2:
# 1. They both define a function named urlopen()

"""An extensible library for opening URLs using a variety of protocols

The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the result as a file-like
object; the returned object has some extra methods described below.

The OpenerDirector manages a collection of Handler objects that do
all the actual work.  Each Handler implements a particular protocol or
option.  The OpenerDirector is a composite object that invokes the
Handlers needed to open the requested URL.  For example, the
HTTPHandler performs HTTP GET and POST requests and deals with
non-error returns.  The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
deals with digest authentication.

urlopen(url, data=None) -- Basic usage is the same as the original
urllib: pass the url and optionally data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of a URL.  Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.

build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of the default
handler, the argument will be installed instead of the default.

install_opener -- Installs a new opener as the default opener.

objects of interest:
OpenerDirector --

Request -- An object that encapsulates the state of a request.  The
state can be as simple as the URL.  It can also include extra HTTP
headers, e.g. a User-Agent.

BaseHandler --

internals:
BaseHandler and parent
_call_chain conventions

Example usage:

import urllib.request

# set up authentication info
authinfo = urllib.request.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
                      uri='https://mahler:8092/site-updates.py',
                      user='klem',
                      passwd='geheim$parole')

proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib.request.build_opener(proxy_support, authinfo,
                                     urllib.request.CacheFTPHandler)

# install it
urllib.request.install_opener(opener)

f = urllib.request.urlopen('http://www.python.org/')
"""

# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails, how should the error be signalled?  The client needs to
# know the HTTP error code.  But if the handler knows what the problem
# was, e.g., that it didn't know the hash algorithm requested in the
# challenge, it would be good to pass that information along to the
# client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation

# Possible extensions:
# complex proxies  XXX not sure what exactly was meant by this
# abstract factory for opener

import base64
import email
import hashlib
import http.client
import io
import os
import posixpath
import random
import re
import socket
import sys
import time
import urllib.parse, urllib.error, urllib.response
import bisect

from io import StringIO

# check for SSL
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True

# used in User-Agent header sent
__version__ = sys.version[:3]

_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    global _opener
    if _opener is None:
        _opener = build_opener()
    return _opener.open(url, data, timeout)

def install_opener(opener):
    global _opener
    _opener = opener

# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)

def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()
    global _opener
    if _opener:
        _opener = None

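# Sketch of how install_opener() changes the module-level urlopen()
# (illustrative only; the proxy address is a placeholder):
#
#     opener = build_opener(ProxyHandler({'http': 'http://proxy:3128'}))
#     install_opener(opener)
#     # Subsequent urlopen() calls now go through the proxy-aware opener.
#     f = urlopen('http://www.example.com/')
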
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    url = request.get_full_url()
    host = urllib.parse.urlparse(url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    host = _cut_port_re.sub("", host, 1)
    return host.lower()

class Request:

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = urllib.parse.unwrap(url)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self.data = data
        self.headers = {}
        for key, value in headers.items():
            self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        if attr[:12] == '_Request__r_':
            name = attr[12:]
            if hasattr(Request, 'get_' + name):
                getattr(self, 'get_' + name)()
                return getattr(self, attr)
        raise AttributeError(attr)

    def get_method(self):
        if self.has_data():
            return "POST"
        else:
            return "GET"

    # XXX these helper methods are lame

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.__original

    def get_type(self):
        if self.type is None:
            self.type, self.__r_type = urllib.parse.splittype(self.__original)
            if self.type is None:
                raise ValueError("unknown url type: %s" % self.__original)
        return self.type

    def get_host(self):
        if self.host is None:
            self.host, self.__r_host = urllib.parse.splithost(self.__r_type)
            if self.host:
                self.host = urllib.parse.unquote(self.host)
        return self.host

    def get_selector(self):
        return self.__r_host

    def set_proxy(self, host, type):
        self.host, self.type = host, type
        self.__r_host = self.__original

    def get_origin_req_host(self):
        return self.origin_req_host

    def is_unverifiable(self):
        return self.unverifiable

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())

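# Sketch of constructing a Request directly (illustrative only; the
# URL and header values are placeholders):
#
#     req = Request('http://www.example.com/api',
#                   data=b'key=value',
#                   headers={'User-Agent': 'my-client/0.1'})
#     req.get_method()    # 'POST', because data is present
#     req.add_unredirected_header('Authorization', 'Basic ...')
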
class OpenerDirector:
    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # the handlers must work in a specific order, the order
            # is specified in a Handler attribute
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

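    # How add_handler() dispatches on method names (illustrative only;
    # MyHandler is hypothetical): a method named <protocol>_open goes
    # into handle_open, <protocol>_request/<protocol>_response go into
    # the processor tables, and <protocol>_error_<code> goes into
    # handle_error.
    #
    #     class MyHandler(BaseHandler):
    #         def http_open(self, req): ...        # handle_open['http']
    #         def http_request(self, req): ...     # process_request['http']
    #         def http_error_404(self, req, fp, code, msg, hdrs):
    #             ...                              # handle_error['http'][404]
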
    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)

            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        req.timeout = timeout
        protocol = req.get_type()

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.get_type()
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http']  # https is not different from http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)

# XXX probably also want an abstract factory that knows when it makes
# sense to skip a superclass in favor of a subclass and when it might
# make sense to include both

def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)
    skip = set()
    for klass in default_classes:
        for check in handlers:
            if isclass(check):
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                skip.add(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if isclass(h):
            h = h()
        opener.add_handler(h)
    return opener

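# Sketch of replacing a default handler via build_opener (illustrative
# only; the DebugHTTPHandler subclass is hypothetical):
#
#     class DebugHTTPHandler(HTTPHandler):
#         def http_open(self, req):
#             print('opening', req.get_full_url())
#             return HTTPHandler.http_open(self, req)
#
#     # Because DebugHTTPHandler subclasses HTTPHandler, the default
#     # HTTPHandler is skipped and this one is used instead.
#     opener = build_opener(DebugHTTPHandler)
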
class BaseHandler:
    handler_order = 500

    def add_parent(self, parent):
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        if not hasattr(other, "handler_order"):
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other.handler_order


class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if not (200 <= code < 300):
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response

class HTTPDefaultErrorHandler(BaseHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise urllib.error.HTTPError(req.get_full_url(), code, msg, hdrs, fp)

class HTTPRedirectHandler(BaseHandler):
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise urllib.error.HTTPError(req.get_full_url(),
                                         code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib2, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # Be lenient with URIs containing a space.
        newurl = newurl.replace(' ', '%20')
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.get_origin_req_host(),
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return
        newurl = urllib.parse.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise urllib.error.HTTPError(req.get_full_url(), code,
                                             self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"


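# Sketch of disabling redirects by overriding redirect_request
# (illustrative only):
#
#     class NoRedirectHandler(HTTPRedirectHandler):
#         def redirect_request(self, req, fp, code, msg, headers, newurl):
#             # Raising here means no other handler retries the redirect;
#             # the HTTPError still carries the 30x response.
#             raise urllib.error.HTTPError(req.get_full_url(), code, msg,
#                                          headers, fp)
#
#     opener = build_opener(NoRedirectHandler)
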
def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, r_scheme = urllib.parse.splittype(proxy)
    if not r_scheme.startswith("/"):
        # authority
        scheme = None
        authority = proxy
    else:
        # URL
        if not r_scheme.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by
        # sections 3.2 and 3.3), the path is empty or starts with '/'.
        end = r_scheme.find("/", 2)
        if end == -1:
            end = None
        authority = r_scheme[2:end]
    userinfo, hostport = urllib.parse.splituser(authority)
    if userinfo is not None:
        user, password = urllib.parse.splitpasswd(userinfo)
    else:
        user = password = None
    return scheme, user, password, hostport

class ProxyHandler(BaseHandler):
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        orig_type = req.get_type()
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type
        if user and password:
            user_pass = '%s:%s' % (urllib.parse.unquote(user),
                                   urllib.parse.unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = urllib.parse.unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type:
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req)

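# Sketch of a ProxyHandler routing plain HTTP through a proxy
# (illustrative only; host and port are placeholders):
#
#     proxy_support = ProxyHandler({'http': 'http://user:pass@proxy:3128'})
#     opener = build_opener(proxy_support)
#     # proxy_open() parses the userinfo and adds a Proxy-authorization
#     # header before handing the request back to HTTPHandler.
#     f = opener.open('http://www.example.com/')
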
class HTTPPasswordMgr:

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if not realm in self.passwd:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urllib.parse.urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = urllib.parse.splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False


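# Sketch of how the password manager matches URIs (illustrative only;
# realm, host, and credentials are placeholders):
#
#     mgr = HTTPPasswordMgr()
#     mgr.add_password('PDQ Application', 'http://example.com/site/',
#                      'klem', 'pw')
#     # Lookups succeed for any URI at or below the stored path, via
#     # reduce_uri() + is_suburi():
#     mgr.find_user_password('PDQ Application',
#                            'http://example.com/site/page')
#     # -> ('klem', 'pw')
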
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):

    def find_user_password(self, realm, authuri):
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is not None:
            return user, password
        return HTTPPasswordMgr.find_user_password(self, None, authuri)


class AbstractBasicAuthHandler:

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)
        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, quote, realm = mo.groups()
                if scheme.lower() == 'basic':
                    return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.headers.get(self.auth_header, None) == auth:
                return None
            req.add_header(self.auth_header, auth)
            return self.parent.open(req)
        else:
            return None


class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        url = req.get_full_url()
        return self.http_error_auth_reqed('www-authenticate',
                                          url, req, headers)


class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component
        # in authority.  Assume there isn't one, since urllib2 does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        authority = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          authority, req, headers)


def randombytes(n):
    """Return n random bytes."""
    return os.urandom(n)

class AbstractDigestAuthHandler:
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX support for qop="auth-int" is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time.  Hm.  Unless the Password Manager is
            # prompting for the information.  Crap.  This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise urllib.error.HTTPError(req.get_full_url(), 401,
                                         "digest auth failed",
                                         headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        if qop == 'auth':
            self.nonce_count += 1
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise urllib.error.URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # unsupported algorithm; get_authorization() checks for None
            H = None
        # XXX MD5-sess
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None


class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        host = urllib.parse.urlparse(req.get_full_url())[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry


class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        host = req.get_host()
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry

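# Sketch of combining Digest and Basic auth handlers (illustrative
# only; realm, URI, and credentials are placeholders).  Digest is
# tried first because its handler_order (490) sorts before the
# BaseHandler default (500) used by Basic auth:
#
#     mgr = HTTPPasswordMgrWithDefaultRealm()
#     mgr.add_password(None, 'http://example.com/', 'klem', 'secret')
#     opener = build_opener(HTTPDigestAuthHandler(mgr),
#                           HTTPBasicAuthHandler(mgr))
#     f = opener.open('http://example.com/protected')
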
class AbstractHTTPHandler(BaseHandler):

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        host = request.get_host()
        if not host:
            raise urllib.error.URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        scheme, sel = urllib.parse.splittype(request.get_selector())
        sel_host, sel_path = urllib.parse.splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host or host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return the headers as an email.message.Message object
            - geturl(): return the original request URL
            - code: HTTP status code
        """
        host = req.get_host()
        if not host:
            raise urllib.error.URLError('no host given')

        h = http_class(host, timeout=req.timeout)  # will parse host:port
        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict(
            (name.title(), val) for name, val in headers.items())
        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error as err:  # XXX what error?
            raise urllib.error.URLError(err)

        resp = urllib.response.addinfourl(r.fp, r.msg, req.get_full_url())
        resp.code = r.status
        resp.msg = r.reason
        return resp


class HTTPHandler(AbstractHTTPHandler):

    def http_open(self, req):
        return self.do_open(http.client.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_

if hasattr(http.client, 'HTTPSConnection'):
    class HTTPSHandler(AbstractHTTPHandler):

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req)

        https_request = AbstractHTTPHandler.do_request_

class HTTPCookieProcessor(BaseHandler):
    def __init__(self, cookiejar=None):
        import http.cookiejar
        if cookiejar is None:
            cookiejar = http.cookiejar.CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response

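# Sketch of cookie-aware opening (illustrative only; the URL is a
# placeholder).  The processor pair runs as http_request/http_response
# pre- and post-processors registered by add_handler():
#
#     import http.cookiejar
#     jar = http.cookiejar.CookieJar()
#     opener = build_opener(HTTPCookieProcessor(jar))
#     opener.open('http://www.example.com/')
#     for cookie in jar:
#         print(cookie.name, cookie.value)
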
class UnknownHandler(BaseHandler):
    def unknown_open(self, req):
        type = req.get_type()
        raise urllib.error.URLError('unknown url type: %s' % type)

def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated."""
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        if v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed

def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    res = []
    part = ''

    escape = quote = False
    for cur in s:
        if escape:
            part += cur
            escape = False
            continue
        if quote:
            if cur == '\\':
                escape = True
                continue
            elif cur == '"':
                quote = False
            part += cur
            continue

        if cur == ',':
            res.append(part)
            part = ''
            continue

        if cur == '"':
            quote = True

        part += cur

    # append last part
    if part:
        res.append(part)

    return [part.strip() for part in res]

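# Worked example of the two parsers above on a Digest challenge
# (illustrative only; the nonce value is made up):
#
#     challenge = 'realm="test@example.com", nonce="abc123", qop="auth"'
#     parse_http_list(challenge)
#     # -> ['realm="test@example.com"', 'nonce="abc123"', 'qop="auth"']
#     parse_keqv_list(parse_http_list(challenge))
#     # -> {'realm': 'test@example.com', 'nonce': 'abc123', 'qop': 'auth'}
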
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.get_selector()
        if url[:2] == '//' and url[2:3] != '/':
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        if FileHandler.names is None:
            try:
                FileHandler.names = (socket.gethostbyname('localhost'),
                                     socket.gethostbyname(socket.gethostname()))
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        import email.utils
        import mimetypes
        host = req.get_host()
        file = req.get_selector()
        localfile = url2pathname(file)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(file)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = urllib.parse.splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                return urllib.response.addinfourl(open(localfile, 'rb'),
                                                  headers, 'file:'+file)
        except OSError as msg:
            # urllib2 users shouldn't expect OSErrors coming from urlopen()
            raise urllib.error.URLError(msg)
        raise urllib.error.URLError('file not on local host')

def _safe_gethostbyname(host):
    try:
        return socket.gethostbyname(host)
    except socket.gaierror:
        return None

class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.get_host()
        if not host:
            raise urllib.error.URLError('ftp error: no host given')
        host, port = urllib.parse.splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = urllib.parse.splituser(host)
        if user:
            user, passwd = urllib.parse.splitpasswd(user)
        else:
            passwd = None
        host = urllib.parse.unquote(host)
        user = urllib.parse.unquote(user or '')
        passwd = urllib.parse.unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise urllib.error.URLError(msg)
        path, attrs = urllib.parse.splitattr(req.get_selector())
        dirs = path.split('/')
        dirs = list(map(urllib.parse.unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = urllib.parse.splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.get_full_url())[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return urllib.response.addinfourl(fp, headers, req.get_full_url())
        except ftplib.all_errors as msg:
            exc = urllib.error.URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
        return fw

class CacheFTPHandler(FTPHandler):
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}
        self.timeout = {}
        self.soonest = 0
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            # guard against min() on an empty mapping when every
            # cached connection has expired
            if self.timeout:
                self.soonest = min(list(self.timeout.values()))

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(list(self.timeout.values()))

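# Sketch of wiring in the FTP connection cache (illustrative only;
# timeout and connection numbers are arbitrary):
#
#     cache_ftp = CacheFTPHandler()
#     cache_ftp.setTimeout(30)    # keep idle connections 30 seconds
#     cache_ftp.setMaxConns(4)
#     opener = build_opener(cache_ftp)   # replaces the default FTPHandler
#     opener.open('ftp://ftp.example.com/pub/file.txt')
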
# Code moved from the old urllib module

MAXFTPCACHE = 10    # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return urllib.parse.unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return urllib.parse.quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


ftpcache = {}
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink  # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = urllib.parse.unwrap(urllib.parse.toBytes(fullurl))
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return urllib.response.addinfourl(fp, headers, fullurl)
        urltype, url = urllib.parse.splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = urllib.parse.splittype(proxy)
            host, selector = urllib.parse.splithost(proxyhost)
            url = (host, fullurl)  # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error as msg:
            raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = urllib.parse.splittype(fullurl)
        raise IOError('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = urllib.parse.splittype(fullurl)
        raise IOError('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = urllib.parse.unwrap(urllib.parse.toBytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = urllib.parse.splittype(url)
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(urllib.parse.splithost(url1)[1]), hdrs
            except IOError as msg:
                pass
        fp = self.open(url, data)
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            import tempfile
            garbage, path = urllib.parse.splittype(url)
            garbage, path = urllib.parse.splithost(path or "")
            path, garbage = urllib.parse.splitquery(path or "")
            path, garbage = urllib.parse.splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            (fd, filename) = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        result = filename, headers
        if self.tempcache is not None:
            self.tempcache[url] = result
        bs = 1024*8
        size = -1
        read = 0
        blocknum = 0
        if reporthook:
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            reporthook(blocknum, bs, size)
        while 1:
            block = fp.read(bs)
            if not block:
                break
            read += len(block)
            tfp.write(block)
            blocknum += 1
            if reporthook:
                reporthook(blocknum, bs, size)
        fp.close()
        tfp.close()
        del fp
        del tfp

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise urllib.error.ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result

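    # Sketch of a reporthook for retrieve() (illustrative only): it is
    # called once before the first block and once per block read, with
    # size == -1 when no Content-Length header was present.
    #
    #     def progress(blocknum, bs, size):
    #         if size > 0:
    #             print('%d%%' % min(100, blocknum * bs * 100 // size))
    #
    #     urlretrieve('http://www.example.com/big.bin', 'big.bin', progress)
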
    # Each method named open_<type> knows how to open that type of URL

    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_factory.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieve or a host, relative-path pair.
        - data is payload for a POST request or None.
        """

        user_passwd = None
        proxy_passwd = None
        if isinstance(url, str):
            host, selector = urllib.parse.splithost(url)
            if host:
                user_passwd, host = urllib.parse.splituser(host)
                host = urllib.parse.unquote(host)
            realhost = host
        else:
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = urllib.parse.splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = urllib.parse.splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = urllib.parse.splithost(rest)
                if realhost:
                    user_passwd, realhost = urllib.parse.splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

            #print "proxy via http:", host, selector
        if not host: raise IOError('http error', 'no host given')

        if proxy_passwd:
            # b64encode operates on bytes; credentials here are str
            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode("ascii")
        else:
            proxy_auth = None

        if user_passwd:
            auth = base64.b64encode(user_passwd.encode()).decode("ascii")
        else:
            auth = None
        http_conn = connection_factory(host)
        # XXX We should fix urllib so that it works with HTTP/1.1.
        http_conn._http_vsn = 10
        http_conn._http_vsn_str = "HTTP/1.0"

        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] = "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost
        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise urllib.error.URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return urllib.response.addinfourl(response.fp, response.msg,
                                              "http:" + url,
                                              response.status)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        return self._open_generic_http(http.client.HTTPConnection, url, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        void = fp.read()
        fp.close()
        raise urllib.error.HTTPError(url, errcode, errmsg, headers, None)

    if _have_ssl:
        def _https_connection(self, host):
            return http.client.HTTPSConnection(host,
                                               key_file=self.key_file,
                                               cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise urllib.error.URLError(
                'file error: proxy support for file protocol '
                'currently not implemented')
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            return self.open_ftp(url)
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, email.utils
        host, file = urllib.parse.splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise urllib.error.URLError(e)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return urllib.response.addinfourl(open(localname, 'rb'),
                                              headers, urlfile)
        host, port = urllib.parse.splitport(host)
        if (not port
            and socket.gethostbyname(host) in (localhost(), thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return urllib.response.addinfourl(open(localname, 'rb'),
                                              headers, urlfile)
        raise urllib.error.URLError('local file error: not on local host')

1679 def open_ftp(self, url):
1680 """Use FTP protocol."""
1681 if not isinstance(url, str):
1682 raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
1683 import mimetypes
1684 from io import StringIO
1685 host, path = urllib.parse.splithost(url)
1686 if not host: raise URLError('ftp error', 'no host given')
1687 host, port = urllib.parse.splitport(host)
1688 user, host = urllib.parse.splituser(host)
1689 if user: user, passwd = urllib.parse.splitpasswd(user)
1690 else: passwd = None
1691 host = urllib.parse.unquote(host)
1692 user = urllib.parse.unquote(user or '')
1693 passwd = urllib.parse.unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = urllib.parse.splitattr(path)
        path = urllib.parse.unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily; iterate over a copy of
            # the keys since entries are deleted during the loop
            for k in list(self.ftpcache):
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if key not in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = urllib.parse.splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
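            # e.g. a URL ending in ';type=a' forces an ASCII ('A') transfer,
            # ';type=i' a binary ('image') one, ';type=d' a directory listing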
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return urllib.response.addinfourl(fp, headers, "ftp:" + url)
        except ftperrors() as msg:
            raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])

    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise URLError('data error', 'proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
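        # Example (illustrative): 'data:text/plain;base64,SGVsbG8=' has
        # mediatype 'text/plain', encoding 'base64', and payload b'Hello':
        #
        #     >>> import base64
        #     >>> base64.b64decode('SGVsbG8=')
        #     b'Hello'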
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise IOError('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s' % time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                              time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            # decode via bytes; latin-1 gives a 1:1 byte/character mapping
            data = base64.b64decode(data.encode('ascii')).decode('latin-1')
        else:
            data = urllib.parse.unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        headers = email.message_from_string(msg)
        f = io.StringIO(msg)
        #f.fileno = None     # needed for addinfourl
        return urllib.response.addinfourl(f, headers, url)


class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return urllib.response.addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
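        # e.g. urllib.parse.urljoin('http://example.com/a/b', '../c')
        # yields 'http://example.com/c'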
        newurl = urllib.parse.urljoin(self.type + ":" + url, newurl)
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        if 'www-authenticate' not in headers:
            return URLopener.http_error_default(self, url, fp,
                                                errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
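        # e.g. a challenge 'Basic realm="WallyWorld"' matches as
        # scheme 'Basic', realm 'WallyWorld'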
        if not match:
            return URLopener.http_error_default(self, url, fp,
                                                errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            return URLopener.http_error_default(self, url, fp,
                                                errcode, errmsg, headers)
        name = 'retry_' + self.type + '_basic_auth'
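        # e.g. for an 'http' URL this dispatches to retry_http_basic_auth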
        if data is None:
            return getattr(self, name)(url, realm)
        else:
            return getattr(self, name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        if 'proxy-authenticate' not in headers:
            return URLopener.http_error_default(self, url, fp,
                                                errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            return URLopener.http_error_default(self, url, fp,
                                                errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            return URLopener.http_error_default(self, url, fp,
                                                errcode, errmsg, headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        else:
            return getattr(self, name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        host, selector = urllib.parse.splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = urllib.parse.splittype(proxy)
        proxyhost, proxyselector = urllib.parse.splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (urllib.parse.quote(user, safe=''),
                                  urllib.parse.quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        host, selector = urllib.parse.splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = urllib.parse.splittype(proxy)
        proxyhost, proxyselector = urllib.parse.splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (urllib.parse.quote(user, safe=''),
                                  urllib.parse.quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        host, selector = urllib.parse.splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (urllib.parse.quote(user, safe=''),
                             urllib.parse.quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        host, selector = urllib.parse.splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (urllib.parse.quote(user, safe=''),
                             urllib.parse.quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None


# Utility functions

_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    global _localhost
    if _localhost is None:
        _localhost = socket.gethostbyname('localhost')
    return _localhost

_thishost = None
def thishost():
    """Return the IP address of the current host."""
    global _thishost
    if _thishost is None:
        _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost

_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if _ftperrors is None:
        import ftplib
        _ftperrors = ftplib.all_errors
    return _ftperrors

_noheaders = None
def noheaders():
    """Return an empty email Message object."""
    global _noheaders
    if _noheaders is None:
        _noheaders = email.message_from_string("")
    return _noheaders


# Utility classes

class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                if str(reason)[:3] != '550':
                    raise urllib.error.URLError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise urllib.error.URLError('ftp error', reason) from reason
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (urllib.response.addclosehook(conn[0].makefile('rb'),
                                             self.endtransfer), conn[1])

    def endtransfer(self):
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
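
# Example usage (a sketch; ftp.example.com is illustrative). open_ftp()
# keys cached wrappers by (user, host, port, dirs) and reuses the
# logged-in connection:
#
#     >>> w = ftpwrapper('anonymous', 'guest', 'ftp.example.com', 21, ['pub'])
#     >>> fp, retrlen = w.retrfile('README', 'I')   # binary RETR
#     >>> data = fp.read(); fp.close(); w.close()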

# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if name == 'no_proxy':
            # handled in proxy_bypass_environment
            continue
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies
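
# Example (illustrative): with http_proxy set in the environment,
#
#     $ http_proxy=http://proxy.example.com:3128 python
#     >>> getproxies_environment()
#     {'http': 'http://proxy.example.com:3128'}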

def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = urllib.parse.splitport(host)
    # check if the host ends with any of the DNS suffixes
    for name in no_proxy.split(','):
        if name and (hostonly.endswith(name) or host.endswith(name)):
            return 1
    # otherwise, don't bypass
    return 0
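
# Example (illustrative): with no_proxy='example.com,localhost',
# proxy_bypass_environment('www.example.com') returns 1 (bypass)
# and proxy_bypass_environment('python.org') returns 0.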


if sys.platform == 'darwin':
    def getproxies_internetconfig():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXX To be done.
        # Gopher: XXX To be done.
        return proxies

    def proxy_bypass(host):
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return 0

    def getproxies():
        return getproxies_environment() or getproxies_internetconfig()

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
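                    # e.g. (illustrative) 'http=proxy.example.com:3128;'
                    # 'ftp=ftpproxy.example.com:2121' maps each protocol
                    # to its own proxy address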
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return 0
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = urllib.parse.splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        i = 0
        while i < len(proxyOverride):
            if proxyOverride[i] == '<local>':
                proxyOverride[i:i+1] = ['localhost',
                                        '127.0.0.1',
                                        socket.gethostname(),
                                        socket.gethostbyname(
                                            socket.gethostname())]
            i += 1
        # print proxyOverride
        # now check if we match one of the registry values.
        for test in proxyOverride:
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Test if proxies should not be used for a particular host.

        Checks the environment first and, failing that, the registry's
        ProxyOverride setting.

        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment