# Issues in merging urllib and urllib2:
# 1. They both define a function named urlopen()

4"""An extensible library for opening URLs using a variety of protocols
5
6The simplest way to use this module is to call the urlopen function,
7which accepts a string containing a URL or a Request object (described
8below). It opens the URL and returns the results as file-like
9object; the returned object has some extra methods described below.
10
11The OpenerDirector manages a collection of Handler objects that do
12all the actual work. Each Handler implements a particular protocol or
13option. The OpenerDirector is a composite object that invokes the
14Handlers needed to open the requested URL. For example, the
15HTTPHandler performs HTTP GET and POST requests and deals with
16non-error returns. The HTTPRedirectHandler automatically deals with
17HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
18deals with digest authentication.
19
20urlopen(url, data=None) -- Basic usage is the same as original
21urllib. pass the url and optionally data to post to an HTTP URL, and
22get a file-like object back. One difference is that you can also pass
23a Request instance instead of URL. Raises a URLError (subclass of
24IOError); for HTTP errors, raises an HTTPError, which can also be
25treated as a valid response.
26
27build_opener -- Function that creates a new OpenerDirector instance.
28Will install the default handlers. Accepts one or more Handlers as
29arguments, either instances or Handler classes that it will
30instantiate. If one of the argument is a subclass of the default
31handler, the argument will be installed instead of the default.
32
33install_opener -- Installs a new opener as the default opener.
34
objects of interest:
OpenerDirector -- Manages a collection of Handlers and opens URLs by
invoking their methods in order.

Request -- An object that encapsulates the state of a request. The
state can be as simple as the URL. It can also include extra HTTP
headers, e.g. a User-Agent.

BaseHandler -- Parent class of all concrete Handlers; subclasses
implement the per-protocol and per-condition methods.

internals:
BaseHandler and parent
_call_chain conventions
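(a Handler method named <protocol>_open, <protocol>_request,
<protocol>_response, or <protocol>_error_<code> is registered
automatically for that protocol and condition)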

Example usage:

import urllib.request

# set up authentication info
authinfo = urllib.request.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
                      uri='https://mahler:8092/site-updates.py',
                      user='klem',
                      passwd='geheim$parole')

proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib.request.build_opener(proxy_support, authinfo,
                                     urllib.request.CacheFTPHandler)

# install it
urllib.request.install_opener(opener)

f = urllib.request.urlopen('http://www.python.org/')
"""

# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails, how should the error be signalled? The client needs to
# know the HTTP error code. But if the handler knows the problem was,
# e.g., that it didn't support the hash algorithm requested in the
# challenge, it would be good to pass that information along to the
# client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation

# Possible extensions:
# complex proxies  XXX not sure what exactly was meant by this
# abstract factory for opener
import base64
import email
import hashlib
import http.client
import io
import os
import posixpath
import random
import re
import socket
import sys
import time
import urllib.parse, urllib.error, urllib.response
import bisect

from io import StringIO

# check for SSL
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True

# used in User-Agent header sent
__version__ = sys.version[:3]

_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    global _opener
    if _opener is None:
        _opener = build_opener()
    return _opener.open(url, data, timeout)

def install_opener(opener):
    global _opener
    _opener = opener

# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)

def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()
    global _opener
    if _opener:
        _opener = None

# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    url = request.get_full_url()
    host = urllib.parse.urlparse(url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    host = _cut_port_re.sub("", host, 1)
    return host.lower()

class Request:

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = urllib.parse.unwrap(url)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self.data = data
        self.headers = {}
        for key, value in headers.items():
            self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        if attr[:12] == '_Request__r_':
            name = attr[12:]
            if hasattr(Request, 'get_' + name):
                getattr(self, 'get_' + name)()
                return getattr(self, attr)
        raise AttributeError(attr)

    def get_method(self):
        if self.has_data():
            return "POST"
        else:
            return "GET"

    # XXX these helper methods are lame

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.__original

    def get_type(self):
        if self.type is None:
            self.type, self.__r_type = urllib.parse.splittype(self.__original)
            if self.type is None:
                raise ValueError("unknown url type: %s" % self.__original)
        return self.type

    def get_host(self):
        if self.host is None:
            self.host, self.__r_host = urllib.parse.splithost(self.__r_type)
            if self.host:
                self.host = urllib.parse.unquote(self.host)
        return self.host

    def get_selector(self):
        return self.__r_host

    def set_proxy(self, host, type):
        self.host, self.type = host, type
        self.__r_host = self.__original

    def get_origin_req_host(self):
        return self.origin_req_host

    def is_unverifiable(self):
        return self.unverifiable

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
257
258class OpenerDirector:
259 def __init__(self):
260 client_version = "Python-urllib/%s" % __version__
261 self.addheaders = [('User-agent', client_version)]
262 # manage the individual handlers
263 self.handlers = []
264 self.handle_open = {}
265 self.handle_error = {}
266 self.process_response = {}
267 self.process_request = {}
268
269 def add_handler(self, handler):
270 if not hasattr(handler, "add_parent"):
271 raise TypeError("expected BaseHandler instance, got %r" %
272 type(handler))
273
274 added = False
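        # Handler methods follow a naming convention:
        #   <protocol>_open / <protocol>_request / <protocol>_response /
        #   <protocol>_error_<code>; each matching method name registers
        #   this handler in the corresponding lookup table below.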
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # the handlers must work in a specific order, the order
            # is specified in a Handler attribute
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)

            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        req.timeout = timeout
        protocol = req.get_type()

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.get_type()
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)

# XXX probably also want an abstract factory that knows when it makes
# sense to skip a superclass in favor of a subclass and when it might
# make sense to include both

def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
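
    A usage sketch (the proxy URL here is illustrative):

        opener = build_opener(ProxyHandler({'http': 'http://proxy:3128'}))
        opener.open('http://www.python.org/')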
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)
    skip = set()
    for klass in default_classes:
        for check in handlers:
            if isclass(check):
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                skip.add(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if isclass(h):
            h = h()
        opener.add_handler(h)
    return opener

class BaseHandler:
    handler_order = 500

    def add_parent(self, parent):
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        if not hasattr(other, "handler_order"):
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other.handler_order


class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if not (200 <= code < 300):
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response

class HTTPDefaultErrorHandler(BaseHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise urllib.error.HTTPError(req.get_full_url(), code, msg, hdrs, fp)

class HTTPRedirectHandler(BaseHandler):
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise urllib.error.HTTPError(req.get_full_url(),
                                         code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib2, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be lenient with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.get_origin_req_host(),
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return
        newurl = urllib.parse.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise urllib.error.HTTPError(req.get_full_url(), code,
                                             self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"


def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, r_scheme = urllib.parse.splittype(proxy)
    if not r_scheme.startswith("/"):
        # authority
        scheme = None
        authority = proxy
    else:
        # URL
        if not r_scheme.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.2
        # and 3.3), path is empty or starts with '/'
        end = r_scheme.find("/", 2)
        if end == -1:
            end = None
        authority = r_scheme[2:end]
    userinfo, hostport = urllib.parse.splituser(authority)
    if userinfo is not None:
        user, password = urllib.parse.splitpasswd(userinfo)
    else:
        user = password = None
    return scheme, user, password, hostport

class ProxyHandler(BaseHandler):
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        orig_type = req.get_type()
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type
        if user and password:
            user_pass = '%s:%s' % (urllib.parse.unquote(user),
                                   urllib.parse.unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = urllib.parse.unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type:
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req)

class HTTPPasswordMgr:

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if not realm in self.passwd:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)
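
    # Typical use (realm and credentials illustrative):
    #   mgr = HTTPPasswordMgr()
    #   mgr.add_password('realm', 'http://example.com/', 'joe', 'secret')
    #   mgr.find_user_password('realm', 'http://example.com/spam')
    #     -> ('joe', 'secret')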

    def find_user_password(self, realm, authuri):
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urllib.parse.urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = urllib.parse.splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
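        # note: posixpath.commonprefix compares character by character,
        # so a base path of '/foo' also matches '/foobar'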
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False


class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):

    def find_user_password(self, realm, authuri):
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is not None:
            return user, password
        return HTTPPasswordMgr.find_user_password(self, None, authuri)


class AbstractBasicAuthHandler:

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)
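
    # e.g. for the header value 'Basic realm="example"', group(1) is
    # 'Basic' and group(3) is 'example'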

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)
        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, quote, realm = mo.groups()
                if scheme.lower() == 'basic':
                    return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.headers.get(self.auth_header, None) == auth:
                return None
            req.add_header(self.auth_header, auth)
            return self.parent.open(req)
        else:
            return None


class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        url = req.get_full_url()
        return self.http_error_auth_reqed('www-authenticate',
                                          url, req, headers)


class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib2 does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        authority = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          authority, req, headers)


def randombytes(n):
    """Return n random bytes."""
    return os.urandom(n)

class AbstractDigestAuthHandler:
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" support is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time.  Hm.  Unless the Password Manager is
            # prompting for the information.  Crap.  This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise urllib.error.HTTPError(req.get_full_url(), 401,
                                         "digest auth failed",
                                         headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

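        # RFC 2617: A1 = "user:realm:password", A2 = "method:uri"; the
        # response digest is KD(H(A1), "nonce:[nc:cnonce:qop:]H(A2)")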
        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        if qop == 'auth':
            self.nonce_count += 1
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise urllib.error.URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            H = None    # unsupported algorithm; callers check for None
        # XXX MD5-sess
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None


class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        host = urllib.parse.urlparse(req.get_full_url())[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry


class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        host = req.get_host()
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry
989
990class AbstractHTTPHandler(BaseHandler):
991
992 def __init__(self, debuglevel=0):
993 self._debuglevel = debuglevel
994
995 def set_http_debuglevel(self, level):
996 self._debuglevel = level
997
998 def do_request_(self, request):
999 host = request.get_host()
1000 if not host:
1001 raise urllib.error.URLError('no host given')
1002
1003 if request.has_data(): # POST
1004 data = request.get_data()
1005 if not request.has_header('Content-type'):
1006 request.add_unredirected_header(
1007 'Content-type',
1008 'application/x-www-form-urlencoded')
1009 if not request.has_header('Content-length'):
1010 request.add_unredirected_header(
1011 'Content-length', '%d' % len(data))
1012
1013 scheme, sel = urllib.parse.splittype(request.get_selector())
1014 sel_host, sel_path = urllib.parse.splithost(sel)
1015 if not request.has_header('Host'):
1016 request.add_unredirected_header('Host', sel_host or host)
1017 for name, value in self.parent.addheaders:
1018 name = name.capitalize()
1019 if not request.has_header(name):
1020 request.add_unredirected_header(name, value)
1021
1022 return request
1023
1024 def do_open(self, http_class, req):
1025 """Return an addinfourl object for the request, using http_class.
1026
1027 http_class must implement the HTTPConnection API from http.client.
1028 The addinfourl return value is a file-like object. It also
1029 has methods and attributes including:
1030 - info(): return a mimetools.Message object for the headers
1031 - geturl(): return the original request URL
1032 - code: HTTP status code
1033 """
1034 host = req.get_host()
1035 if not host:
1036 raise urllib.error.URLError('no host given')
1037
1038 h = http_class(host, timeout=req.timeout) # will parse host:port
1039 headers = dict(req.headers)
1040 headers.update(req.unredirected_hdrs)
1041
1042 # TODO(jhylton): Should this be redesigned to handle
1043 # persistent connections?
1044
1045 # We want to make an HTTP/1.1 request, but the addinfourl
1046 # class isn't prepared to deal with a persistent connection.
1047 # It will try to read all remaining data from the socket,
1048 # which will block while the server waits for the next request.
1049 # So make sure the connection gets closed after the (only)
1050 # request.
1051 headers["Connection"] = "close"
1052 headers = dict(
1053 (name.title(), val) for name, val in headers.items())
1054 try:
1055 h.request(req.get_method(), req.get_selector(), req.data, headers)
1056 r = h.getresponse()
1057 except socket.error as err: # XXX what error?
1058 raise urllib.error.URLError(err)
1059
1060 resp = urllib.response.addinfourl(r.fp, r.msg, req.get_full_url())
1061 resp.code = r.status
1062 resp.msg = r.reason
1063 return resp
1064

class HTTPHandler(AbstractHTTPHandler):

    def http_open(self, req):
        return self.do_open(http.client.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_

if hasattr(http.client, 'HTTPSConnection'):
    class HTTPSHandler(AbstractHTTPHandler):

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req)

        https_request = AbstractHTTPHandler.do_request_

class HTTPCookieProcessor(BaseHandler):
    def __init__(self, cookiejar=None):
        import http.cookiejar
        if cookiejar is None:
            cookiejar = http.cookiejar.CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response

class UnknownHandler(BaseHandler):
    def unknown_open(self, req):
        type = req.get_type()
        raise urllib.error.URLError('unknown url type: %s' % type)

def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated."""
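    # e.g. parse_keqv_list(['foo=bar', 'baz="2"']) -> {'foo': 'bar', 'baz': '2'}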
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        if v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed

def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
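
    For example, parse_http_list('a, "b, c", d') returns
    ['a', '"b, c"', 'd'].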
    """
    res = []
    part = ''

    escape = quote = False
    for cur in s:
        if escape:
            part += cur
            escape = False
            continue
        if quote:
            if cur == '\\':
                escape = True
                continue
            elif cur == '"':
                quote = False
            part += cur
            continue

        if cur == ',':
            res.append(part)
            part = ''
            continue

        if cur == '"':
            quote = True

        part += cur

    # append last part
    if part:
        res.append(part)

    return [part.strip() for part in res]

class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.get_selector()
        if url[:2] == '//' and url[2:3] != '/':
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        if FileHandler.names is None:
            try:
                FileHandler.names = (socket.gethostbyname('localhost'),
                                     socket.gethostbyname(socket.gethostname()))
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        import email.utils
        import mimetypes
        host = req.get_host()
        file = req.get_selector()
        localfile = url2pathname(file)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(file)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = urllib.parse.splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                return urllib.response.addinfourl(open(localfile, 'rb'),
                                                  headers, 'file:'+file)
        except OSError as msg:
            # urllib2 users shouldn't expect OSErrors coming from urlopen()
            raise urllib.error.URLError(msg)
        raise urllib.error.URLError('file not on local host')

def _safe_gethostbyname(host):
    try:
        return socket.gethostbyname(host)
    except socket.gaierror:
        return None

class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.get_host()
        if not host:
            raise urllib.error.URLError('ftp error: no host given')
        host, port = urllib.parse.splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = urllib.parse.splituser(host)
        if user:
            user, passwd = urllib.parse.splitpasswd(user)
        else:
            passwd = None
        host = urllib.parse.unquote(host)
        user = urllib.parse.unquote(user or '')
        passwd = urllib.parse.unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise urllib.error.URLError(msg)
        path, attrs = urllib.parse.splitattr(req.get_selector())
        dirs = path.split('/')
        dirs = list(map(urllib.parse.unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = urllib.parse.splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.get_full_url())[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return urllib.response.addinfourl(fp, headers, req.get_full_url())
        except ftplib.all_errors as msg:
            exc = urllib.error.URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
        return fw

class CacheFTPHandler(FTPHandler):
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}
        self.timeout = {}
        self.soonest = 0
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        self.soonest = min(list(self.timeout.values()))

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(list(self.timeout.values()))

# Code moved from the old urllib module

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return urllib.parse.unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return urllib.parse.quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


ftpcache = {}
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink  # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = urllib.parse.unwrap(urllib.parse.toBytes(fullurl))
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return urllib.response.addinfourl(fp, headers, fullurl)
        urltype, url = urllib.parse.splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = urllib.parse.splittype(proxy)
            host, selector = urllib.parse.splithost(proxyhost)
            url = (host, fullurl)  # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error as msg:
            raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = urllib.parse.splittype(fullurl)
        raise IOError('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = urllib.parse.splittype(fullurl)
        raise IOError('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = urllib.parse.unwrap(urllib.parse.toBytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = urllib.parse.splittype(url)
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(urllib.parse.splithost(url1)[1]), hdrs
            except IOError as msg:
                pass
        fp = self.open(url, data)
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            import tempfile
            garbage, path = urllib.parse.splittype(url)
            garbage, path = urllib.parse.splithost(path or "")
            path, garbage = urllib.parse.splitquery(path or "")
            path, garbage = urllib.parse.splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            (fd, filename) = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        result = filename, headers
        if self.tempcache is not None:
            self.tempcache[url] = result
        bs = 1024*8
        size = -1
        read = 0
        blocknum = 0
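        # reporthook is called as reporthook(block number, block size,
        # total size); total size stays -1 when Content-Length is unknown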
        if reporthook:
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            reporthook(blocknum, bs, size)
        while 1:
            block = fp.read(bs)
            if not block:
                break
            read += len(block)
            tfp.write(block)
            blocknum += 1
            if reporthook:
                reporthook(blocknum, bs, size)
        fp.close()
        tfp.close()
        del fp
        del tfp

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise urllib.error.ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result

    # Each method named open_<type> knows how to open that type of URL

    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_factory.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the URL to retrieve or a (host, relative-path) pair.
        - data is payload for a POST request or None.
        """

        user_passwd = None
        proxy_passwd = None
        if isinstance(url, str):
            host, selector = urllib.parse.splithost(url)
            if host:
                user_passwd, host = urllib.parse.splituser(host)
                host = urllib.parse.unquote(host)
            realhost = host
        else:
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = urllib.parse.splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = urllib.parse.splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = urllib.parse.splithost(rest)
                if realhost:
                    user_passwd, realhost = urllib.parse.splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

            #print "proxy via http:", host, selector
        if not host: raise IOError('http error', 'no host given')

        if proxy_passwd:
            import base64
            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
        else:
            proxy_auth = None

        if user_passwd:
            import base64
            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
        else:
            auth = None
        http_conn = connection_factory(host)
        # XXX We should fix urllib so that it works with HTTP/1.1.
        http_conn._http_vsn = 10
        http_conn._http_vsn_str = "HTTP/1.0"

        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] = "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost
        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise urllib.error.URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return urllib.response.addinfourl(response.fp, response.msg,
                                              "http:" + url,
                                              response.status)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        return self._open_generic_http(http.client.HTTPConnection, url, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        void = fp.read()
        fp.close()
        raise urllib.error.HTTPError(url, errcode, errmsg, headers, None)

    if _have_ssl:
        def _https_connection(self, host):
            return http.client.HTTPSConnection(host,
                                               key_file=self.key_file,
                                               cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise urllib.error.URLError('file error',
                'proxy support for file protocol currently not implemented')
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            return self.open_ftp(url)
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, email.utils
        from io import StringIO
        host, file = urllib.parse.splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise urllib.error.URLError(e)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return urllib.response.addinfourl(open(localname, 'rb'),
                                              headers, urlfile)
        host, port = urllib.parse.splitport(host)
        if (not port
            and socket.gethostbyname(host) in (localhost(), thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return urllib.response.addinfourl(open(localname, 'rb'),
                                              headers, urlfile)
        raise urllib.error.URLError('local file error', 'not on local host')
1681
    def open_ftp(self, url):
        """Use FTP protocol."""
        if not isinstance(url, str):
            raise URLError('ftp error',
                           'proxy support for ftp protocol currently not implemented')
        import mimetypes
        host, path = urllib.parse.splithost(url)
        if not host: raise URLError('ftp error', 'no host given')
        host, port = urllib.parse.splitport(host)
        user, host = urllib.parse.splituser(host)
        if user: user, passwd = urllib.parse.splitpasswd(user)
        else: passwd = None
        host = urllib.parse.unquote(host)
        user = urllib.parse.unquote(user or '')
        passwd = urllib.parse.unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = urllib.parse.splitattr(path)
        path = urllib.parse.unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily; iterate over a copy of
            # the keys since entries are deleted during iteration
            for k in list(self.ftpcache):
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if key not in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = urllib.parse.splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return urllib.response.addinfourl(fp, headers, "ftp:" + url)
        except ftperrors() as msg:
            raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])

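    # Illustrative sketch (comments only): the transfer type defaults to
    # image (binary) for files and directory listing for bare directories,
    # and can be forced with the RFC 1738 ";type=" attribute:
    #
    #   opener.open_ftp('//anonymous@ftp.example.com/pub/notes.txt;type=a')
    #   # -> ASCII retrieval ('TYPE A' then 'RETR notes.txt')
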
    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise URLError('data error',
                           'proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise IOError('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s' % time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                              time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            # decode to bytes, then back to str so len() and StringIO work
            data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
        else:
            data = urllib.parse.unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        headers = email.message_from_string(msg)
        f = io.StringIO(msg)
        #f.fileno = None     # needed for addinfourl
        return urllib.response.addinfourl(f, headers, url)

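    # Illustrative sketch (comments only): data URLs carry their payload
    # inline, so no network traffic results:
    #
    #   f = opener.open_data('data:text/plain;base64,SGVsbG8sIHdvcmxkIQ==')
    #   f.info()['Content-type']    # 'text/plain'
    #   # the returned file yields the synthesized headers plus the decoded
    #   # body 'Hello, world!'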

class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return urllib.response.addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
        return self.open(newurl)

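    # Illustrative sketch (comments only): a relative Location header is
    # resolved against the original request URL before reopening; assuming
    # basejoin behaves like urllib.parse.urljoin,
    #
    #   basejoin('http://www.example.com/a/b', '../c')
    #   # -> 'http://www.example.com/c'
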
    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        else:
            return getattr(self, name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        else:
            return getattr(self, name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        host, selector = urllib.parse.splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = urllib.parse.splittype(proxy)
        proxyhost, proxyselector = urllib.parse.splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (urllib.parse.quote(user, safe=''),
                                  urllib.parse.quote(passwd, safe=''),
                                  proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        host, selector = urllib.parse.splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = urllib.parse.splittype(proxy)
        proxyhost, proxyselector = urllib.parse.splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (urllib.parse.quote(user, safe=''),
                                  urllib.parse.quote(passwd, safe=''),
                                  proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        host, selector = urllib.parse.splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (urllib.parse.quote(user, safe=''),
                             urllib.parse.quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        host, selector = urllib.parse.splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (urllib.parse.quote(user, safe=''),
                             urllib.parse.quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

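    # Illustrative sketch (comments only): a 401 retry embeds the quoted
    # credentials in the netloc and reopens the URL, so a request for
    #
    #   http://www.example.com/private/
    #
    # is retried (given user 'klem' and password 'p w' -- both assumptions) as
    #
    #   http://klem:p%20w@www.example.com/private/
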
    def get_user_passwd(self, host, realm, clear_cache=0):
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                                     (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None

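# A minimal sketch (an assumption, not part of this module) of supplying
# fixed credentials instead of prompting on the terminal:
#
#   class AutoAuthOpener(FancyURLopener):
#       def __init__(self, user, passwd, *args, **kwargs):
#           FancyURLopener.__init__(self, *args, **kwargs)
#           self._creds = (user, passwd)
#       def prompt_user_passwd(self, host, realm):
#           return self._creds    # same (user, passwd) for every realm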

# Utility functions

_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    global _localhost
    if _localhost is None:
        _localhost = socket.gethostbyname('localhost')
    return _localhost

_thishost = None
def thishost():
    """Return the IP address of the current host."""
    global _thishost
    if _thishost is None:
        _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost

_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if _ftperrors is None:
        import ftplib
        _ftperrors = ftplib.all_errors
    return _ftperrors

_noheaders = None
def noheaders():
    """Return an empty email.message.Message object."""
    global _noheaders
    if _noheaders is None:
        _noheaders = email.message_from_string("")
    return _noheaders


# Utility classes

class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                if str(reason)[:3] != '550':
                    raise urllib.error.URLError('ftp error', reason).with_traceback(sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise urllib.error.URLError('ftp error', reason) from reason
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (urllib.response.addclosehook(conn[0].makefile('rb'),
                                             self.endtransfer), conn[1])

    def endtransfer(self):
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass

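# A minimal sketch (assuming a reachable anonymous FTP server) of using
# ftpwrapper directly, mirroring what open_ftp() does through its cache:
#
#   wrapped = ftpwrapper('anonymous', 'user@', 'ftp.example.com', 21,
#                        ['pub'])                 # login, then cwd into 'pub'
#   fp, length = wrapped.retrfile('README', 'I')  # binary RETR
#   data = fp.read()
#   fp.close()                                    # runs the endtransfer hook
#   wrapped.close()
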
# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if name == 'no_proxy':
            # handled in proxy_bypass_environment
            continue
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies

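# Illustrative sketch (comments only): with the environment set up as, say,
#
#   http_proxy=http://proxy.example.com:3128
#   ftp_proxy=http://proxy.example.com:3128
#
# getproxies_environment() returns
#
#   {'http': 'http://proxy.example.com:3128',
#    'ftp': 'http://proxy.example.com:3128'}
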
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = urllib.parse.splitport(host)
    # check if the host ends with any of the DNS suffixes
    for name in no_proxy.split(','):
        if name and (hostonly.endswith(name) or host.endswith(name)):
            return 1
    # otherwise, don't bypass
    return 0

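# Illustrative sketch (comments only): with no_proxy=".example.com,localhost"
#
#   proxy_bypass_environment('www.example.com')    # -> 1 (suffix match)
#   proxy_bypass_environment('localhost:8080')     # -> 1 (port is stripped)
#   proxy_bypass_environment('www.python.org')     # -> 0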

if sys.platform == 'darwin':
    def getproxies_internetconfig():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXX To be done.
        # Gopher: XXX To be done.
        return proxies

    def proxy_bypass(host):
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return 0

    def getproxies():
        return getproxies_environment() or getproxies_internetconfig()

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

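    # Illustrative sketch (comments only): the two ProxyServer formats parsed
    # above are
    #
    #   "http=proxy1:80;ftp=proxy2:21"  -> {'http': 'http://proxy1:80',
    #                                       'ftp': 'ftp://proxy2:21'}
    #   "proxyhost:3128"                -> {'http': 'http://proxyhost:3128',
    #                                       'ftp': 'ftp://proxyhost:3128'}
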
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = urllib.parse.splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        i = 0
        while i < len(proxyOverride):
            if proxyOverride[i] == '<local>':
                proxyOverride[i:i+1] = ['localhost',
                                        '127.0.0.1',
                                        socket.gethostname(),
                                        socket.gethostbyname(
                                            socket.gethostname())]
            i += 1
        # now check if we match one of the registry values.
        for test in proxyOverride:
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                if re.match(test, val, re.I):
                    return 1
        return 0

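    # Illustrative sketch (comments only): ProxyOverride entries are globs,
    # translated to regexes before matching; with ProxyOverride set to
    # "*.internal.example.com;<local>" (an assumption),
    #
    #   proxy_bypass_registry('www.internal.example.com')  # -> 1
    #   proxy_bypass_registry('www.python.org')            # -> 0
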
    def proxy_bypass(host):
        """Test if the host should bypass the proxy.

        Checks the environment's no_proxy setting, if specified,
        otherwise the registry's ProxyOverride list.

        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment