# Issues in merging urllib and urllib2:
# 1. They both define a function named urlopen()

"""An extensible library for opening URLs using a variety of protocols

The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.

The OpenerDirector manages a collection of Handler objects that do
all the actual work.  Each Handler implements a particular protocol or
option.  The OpenerDirector is a composite object that invokes the
Handlers needed to open the requested URL.  For example, the
HTTPHandler performs HTTP GET and POST requests and deals with
non-error returns.  The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
deals with digest authentication.

urlopen(url, data=None) -- Basic usage is the same as original
urllib.  Pass the url and optionally data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of URL.  Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.

build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of the default
handler, the argument will be installed instead of the default.

install_opener -- Installs a new opener as the default opener.

objects of interest:
OpenerDirector --

Request -- An object that encapsulates the state of a request.  The
state can be as simple as the URL.  It can also include extra HTTP
headers, e.g. a User-Agent.

BaseHandler --

internals:
BaseHandler and parent
_call_chain conventions

Example usage:

import urllib.request

# set up authentication info
authinfo = urllib.request.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
                      uri='https://mahler:8092/site-updates.py',
                      user='klem',
                      passwd='geheim$parole')

proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib.request.build_opener(proxy_support, authinfo,
                                     urllib.request.CacheFTPHandler)

# install it
urllib.request.install_opener(opener)

f = urllib.request.urlopen('http://www.python.org/')
"""
70
# XXX issues:
# If an authentication error handler that tries to perform
# authentication for some reason but fails, how should the error be
# signalled?  The client needs to know the HTTP error code.  But if
# the handler knows that the problem was, e.g., that it didn't know
# the hash algorithm that was requested in the challenge, it would be
# good to pass that information along to the client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation

# Possible extensions:
# complex proxies  XXX not sure what exactly was meant by this
# abstract factory for opener
84
import base64
import bisect
import email
import hashlib
import http.client
import io
import os
import posixpath
import random
import re
import socket
import sys
import time

from urllib.error import URLError, HTTPError, ContentTooShortError
from urllib.parse import (
    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
    splittype, splithost, splitport, splituser, splitpasswd,
    splitattr, splitquery, splitvalue, to_bytes)
from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000105
106# check for SSL
107try:
108 import ssl
109except:
110 _have_ssl = False
111else:
112 _have_ssl = True
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000113
# used in User-Agent header sent
__version__ = sys.version[:3]

# Module-global OpenerDirector shared by all urlopen() calls; created
# lazily on first use and replaceable via install_opener().
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    """Open *url* (a URL string or a Request) and return a file-like response.

    Delegates to the shared module-global OpenerDirector, building it with
    the default handler set on first call; *data* and *timeout* are passed
    straight through to OpenerDirector.open().
    """
    global _opener
    if _opener is None:
        # build the shared default opener on first use
        _opener = build_opener()
    return _opener.open(url, data, timeout)
123
def install_opener(opener):
    """Make *opener* the OpenerDirector used by subsequent urlopen() calls."""
    global _opener
    _opener = opener
127
# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve *url* into a local file via a lazily-created FancyURLopener.

    Note this uses a separate module-global opener from the one urlopen()
    uses (see the TODO above).  Arguments are passed straight through to
    FancyURLopener.retrieve().
    """
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
135
def urlcleanup():
    """Clean up after urlretrieve() and discard the global default opener."""
    if _urlopener:
        # remove temporary files created by FancyURLopener.retrieve()
        _urlopener.cleanup()
    global _opener
    if _opener:
        _opener = None
142
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    full_url = request.get_full_url()
    netloc = urlparse(full_url)[1]
    if not netloc:
        # relative URL: fall back to the Host header, if any
        netloc = request.get_header("Host", "")
    # strip a trailing :port, then normalize case
    return _cut_port_re.sub("", netloc, 1).lower()
160
class Request:
    """Encapsulate one URL request: URL, optional POST data, headers.

    The type/host parsing results are computed lazily by get_type() and
    get_host(); the name-mangled __r_* attributes hold the remainder
    strings left over by the split* helpers, and __getattr__ computes
    them on demand if they are read before the corresponding getter ran.
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self.data = data
        self.headers = {}
        # route through add_header() so keys are capitalized consistently
        for key, value in headers.items():
            self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order. this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        if attr[:12] == '_Request__r_':
            name = attr[12:]
            if hasattr(Request, 'get_' + name):
                # run the getter for its side effect of setting __r_<name>
                getattr(self, 'get_' + name)()
                return getattr(self, attr)
        raise AttributeError(attr)

    def get_method(self):
        """Return the HTTP method: POST when data is present, else GET."""
        if self.has_data():
            return "POST"
        else:
            return "GET"

    # XXX these helper methods are lame

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.__original

    def get_type(self):
        """Return the URL scheme, parsing it lazily; raises ValueError if absent."""
        if self.type is None:
            self.type, self.__r_type = splittype(self.__original)
            if self.type is None:
                raise ValueError("unknown url type: %s" % self.__original)
        return self.type

    def get_host(self):
        """Return the (unquoted) host part, parsing it lazily."""
        if self.host is None:
            self.host, self.__r_host = splithost(self.__r_type)
            if self.host:
                self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        # the path (plus query) sent in the request line
        return self.__r_host

    def set_proxy(self, host, type):
        # redirect the request at a proxy: the proxy becomes the host,
        # and the full original URL becomes the selector
        self.host, self.type = host, type
        self.__r_host = self.__original

    def get_origin_req_host(self):
        return self.origin_req_host

    def is_unverifiable(self):
        return self.unverifiable

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        # regular headers take precedence over unredirected ones
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
261
class OpenerDirector:
    """Manage a chain of handlers and route requests/responses through them.

    Handlers advertise their capabilities through specially named methods:
    <protocol>_open, <protocol>_request, <protocol>_response and
    <protocol>_error_<code>.  add_handler() parses those names and files
    each handler into the matching lookup table; open() and error() then
    walk the relevant chains in handler_order.
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}        # protocol -> handlers with <proto>_open
        self.handle_error = {}       # protocol -> {code -> handlers}
        self.process_response = {}   # protocol -> response post-processors
        self.process_request = {}    # protocol -> request pre-processors

    def add_handler(self, handler):
        """Register *handler*, indexing it by the special methods it defines."""
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # split the method name as "<protocol>_<condition>"
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # "<protocol>_error_<kind>"; kind is usually an HTTP status
                # code, stored as an int when it parses as one
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                # keep each chain sorted by handler_order (BaseHandler.__lt__)
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # the handlers must work in an specific order, the order
            # is specified in a Handler attribute
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)

            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (URL string or Request) and return the response.

        Runs the protocol's request pre-processors, dispatches through
        _open(), then runs the response post-processors.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        req.timeout = timeout
        protocol = req.get_type()

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        # try default_open first, then the protocol-specific chain,
        # and finally unknown_open as a catch-all
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.get_type()
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered <proto>_error_* handlers."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!  (args[2] is the HTTP status code)
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            # fall back to the catch-all http_error_default handlers
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
401
# XXX probably also want an abstract factory that knows when it makes
# sense to skip a superclass in favor of a subclass and when it might
# make sense to include both
405
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def isclass(obj):
        # classes proper, plus anything class-like exposing __bases__
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    # Drop every default class that a caller-supplied handler overrides,
    # whether the caller passed a subclass or an instance of it.
    skip = set()
    for klass in default_classes:
        for check in handlers:
            overrides = (issubclass(check, klass) if isclass(check)
                         else isinstance(check, klass))
            if overrides:
                skip.add(klass)
    default_classes = [klass for klass in default_classes if klass not in skip]

    for klass in default_classes:
        opener.add_handler(klass())

    # Caller-supplied handlers go in last; bare classes are instantiated.
    for handler in handlers:
        if isclass(handler):
            handler = handler()
        opener.add_handler(handler)
    return opener
443
class BaseHandler:
    """Common machinery shared by all handlers managed by OpenerDirector."""

    # Position within each handler chain; lower values run earlier.
    handler_order = 500

    def add_parent(self, parent):
        """Remember the OpenerDirector this handler was registered with."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        """Order handlers by handler_order (used by bisect.insort)."""
        try:
            other_order = other.handler_order
        except AttributeError:
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other_order
461
462
class HTTPErrorProcessor(BaseHandler):
    """Route non-2xx HTTP responses through the parent's error chain."""

    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # RFC 2616: a "2xx" code means the request was successfully
        # received, understood, and accepted -- pass those through as-is.
        if 200 <= code < 300:
            return response
        # Anything else is handed to the error chain, which may return a
        # substitute response or raise HTTPError.
        return self.parent.error('http', request, response, code, msg, hdrs)

    https_response = http_response
479
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort handler: turn any unhandled HTTP error into HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000483
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301/302/303/307 redirects, with loop protection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        # drop entity headers; the redirected request carries no body
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.get_origin_req_host(),
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return
        # resolve a possibly relative Location against the current URL
        newurl = urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            # first redirect: start a fresh visit-count dict, shared by
            # the old and new request objects
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    # all four redirect codes share the same implementation
    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
568
569
def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    # NOTE(review): relies on the legacy urllib.parse split* helpers
    # (splittype/splituser/splitpasswd), matching the rest of this module.
    scheme, r_scheme = splittype(proxy)
    if not r_scheme.startswith("/"):
        # authority
        scheme = None
        authority = proxy
    else:
        # URL
        if not r_scheme.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
        # and 3.3.), path is empty or starts with '/'
        end = r_scheme.find("/", 2)
        if end == -1:
            end = None
        authority = r_scheme[2:end]
    userinfo, hostport = splituser(authority)
    if userinfo is not None:
        user, password = splitpasswd(userinfo)
    else:
        user = password = None
    return scheme, user, password, hostport
641
class ProxyHandler(BaseHandler):
    """Redirect requests through the proxies in a {scheme: proxy_url} map."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # Grow a <scheme>_open method on this instance for every configured
        # scheme; the lambda's default arguments bind the per-scheme values
        # at definition time (avoiding the late-binding closure pitfall).
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*, adding Proxy-authorization if configured."""
        orig_type = req.get_type()
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            # proxy given as a bare authority: assume the request's scheme
            proxy_type = orig_type
        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type:
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req)
679
class HTTPPasswordMgr:
    """Store credentials keyed by realm and reduced URI for later lookup."""

    def __init__(self):
        # {realm: {(reduced_uri, ...): (user, password)}}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register user/passwd for *realm* at *uri* (a URI or a sequence)."""
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if not realm in self.passwd:
            self.passwd[realm] = {}
        # store under both the default-port and as-given reductions so a
        # lookup matches whether or not the caller spelled out the port
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) matching realm/authuri, or (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            # normalize to an explicit port for the schemes we know
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # same authority: test is a suburi if base's path is a prefix
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False
742
743
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to a wildcard (None) realm."""

    def find_user_password(self, realm, authuri):
        # Try an exact realm match first; if that yields nothing, retry
        # with the default realm (None), which acts as a catch-all.
        creds = HTTPPasswordMgr.find_user_password(self, realm, authuri)
        if creds[0] is not None:
            return creds
        return HTTPPasswordMgr.find_user_password(self, None, authuri)
752
753
class AbstractBasicAuthHandler:
    """Shared implementation of HTTP Basic authentication.

    Subclasses supply the auth_header attribute and the http_error_40x
    entry point that calls http_error_auth_reqed().
    """

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry the request with Basic credentials if the challenge asks."""
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)
        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, quote, realm = mo.groups()
                if scheme.lower() == 'basic':
                    return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-issue *req* with an Authorization header, or return None."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.headers.get(self.auth_header, None) == auth:
                # this exact header already failed once -- don't loop
                return None
            req.add_header(self.auth_header, auth)
            return self.parent.open(req)
        else:
            return None
797
798
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Basic authentication against the origin server (HTTP 401)."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        url = req.get_full_url()
        return self.http_error_auth_reqed('www-authenticate',
                                          url, req, headers)
807
808
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Basic authentication against a proxy (HTTP 407)."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        authority = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          authority, req, headers)
821
822
def randombytes(n):
    """Return *n* bytes of OS-level randomness (os.urandom)."""
    return os.urandom(n)
826
class AbstractDigestAuthHandler:
    """Shared implementation of HTTP Digest authentication (RFC 2617).

    Subclasses supply the auth_header attribute and the http_error_40x
    entry point that calls http_error_auth_reqed().
    """

    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0      # consecutive failed attempts on this handler
        self.nonce_count = 0  # "nc" value, incremented per qop="auth" request

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """React to a 401/407: retry with Digest credentials if offered."""
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.get_full_url(), 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Re-issue *req* with an Authorization header built from *auth*."""
        token, challenge = auth.split(' ', 1)
        # NOTE(review): parse_keqv_list and parse_http_list are neither
        # defined nor imported anywhere in this module, so this line raises
        # NameError at runtime; the helpers from the urllib2 port need to
        # be added to the module.  TODO confirm and fix at module level.
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # the exact same header already failed once; give up
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest Authorization value for *req*, or return None."""
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            # unsupported algorithm -- we cannot answer this challenge
            return None

        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        if qop == 'auth':
            self.nonce_count += 1
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return (H, KD) digest primitives for *algorithm*.

        Returns (None, None) for unsupported algorithms so that
        get_authorization's existing ``H is None`` check can bail out
        cleanly.  (Previously an unknown algorithm left H unbound and
        raised UnboundLocalError instead.)
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # XXX MD5-sess
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
958
959
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    # Header written on the outgoing retry request.
    auth_header = 'Authorization'
    handler_order = 490 # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # [1] of the urlparse 6-tuple is the netloc of the failing URL.
        host = urlparse(req.get_full_url())[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           host, req, headers)
        # Clear the retry counter so a later 401 gets a fresh attempt.
        self.reset_retry_count()
        return retry
976
977
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    # Digest authentication against an HTTP proxy (407 responses);
    # mirrors HTTPDigestAuthHandler but uses the proxy header names.

    auth_header = 'Proxy-Authorization'
    handler_order = 490 # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        # For proxy auth the relevant host is the request host itself.
        host = req.get_host()
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry
989
class AbstractHTTPHandler(BaseHandler):
    # Shared machinery for HTTPHandler/HTTPSHandler: request
    # normalization (do_request_) and the network round trip (do_open).

    def __init__(self, debuglevel=0):
        # Debug level intended for the underlying http.client
        # connection; 0 means quiet.
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in default headers on request and return it.

        Adds Content-type/Content-length for POST data, a Host header,
        and any parent-level addheaders not already present.  Raises
        URLError when the request has no host.
        """
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        # Prefer the host embedded in the selector (absolute URL form,
        # as used when going through a proxy) over the request host.
        scheme, sel = splittype(request.get_selector())
        sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host or host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a email Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = http_class(host, timeout=req.timeout) # will parse host:port
        # Unredirected headers take precedence over ordinary ones.
        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # Normalize header capitalization (e.g. "content-type" ->
        # "Content-Type") so duplicates cannot sneak through.
        headers = dict(
            (name.title(), val) for name, val in headers.items())
        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error as err: # XXX what error?
            raise URLError(err)

        resp = addinfourl(r.fp, r.msg, req.get_full_url())
        resp.code = r.status
        resp.msg = r.reason
        return resp
1064
1065
class HTTPHandler(AbstractHTTPHandler):
    # Concrete handler for the "http" scheme.

    def http_open(self, req):
        return self.do_open(http.client.HTTPConnection, req)

    # Request preprocessing is the shared normalization step.
    http_request = AbstractHTTPHandler.do_request_
1072
# HTTPSConnection only exists when Python was built with SSL support,
# so the https handler is defined conditionally.
if hasattr(http.client, 'HTTPSConnection'):
    class HTTPSHandler(AbstractHTTPHandler):

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req)

        https_request = AbstractHTTPHandler.do_request_
1080
class HTTPCookieProcessor(BaseHandler):
    # Adds cookies from a CookieJar to outgoing requests and stores
    # cookies from responses back into it.  HTTPS shares the same hooks.
    def __init__(self, cookiejar=None):
        import http.cookiejar
        if cookiejar is None:
            cookiejar = http.cookiejar.CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1098
class UnknownHandler(BaseHandler):
    # Last-resort handler: any scheme no other handler claimed is an error.
    def unknown_open(self, req):
        type = req.get_type()
        raise URLError('unknown url type: %s' % type)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001103
def parse_keqv_list(l):
    """Parse a list of key=value strings into a dict.

    Keys are assumed not to repeat.  A value wrapped in double quotes
    has the surrounding quotes stripped.
    """
    result = {}
    for item in l:
        key, value = item.split('=', 1)
        if value[0] == '"' and value[-1] == '"':
            value = value[1:-1]
        result[key] = value
    return result
1113
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    Splits a comma-separated list whose elements may include
    quoted-strings; a quoted-string may contain a comma, and a
    non-quoted string may contain quote characters.  Backslash escapes
    are honored only inside double quotes; single quotes have no
    special meaning.  Returns the elements with surrounding whitespace
    stripped.
    """
    pieces = []
    current = []
    in_quotes = False
    pending_escape = False

    for ch in s:
        if pending_escape:
            # Previous char was a backslash inside quotes: take this
            # char literally (the backslash itself is dropped).
            current.append(ch)
            pending_escape = False
        elif in_quotes:
            if ch == '\\':
                pending_escape = True
            else:
                if ch == '"':
                    in_quotes = False
                current.append(ch)
        elif ch == ',':
            # Unquoted comma terminates the current element.
            pieces.append(''.join(current))
            current = []
        else:
            if ch == '"':
                in_quotes = True
            current.append(ch)

    # Flush the trailing element, if any.
    tail = ''.join(current)
    if tail:
        pieces.append(tail)

    return [piece.strip() for piece in pieces]
1156
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.get_selector()
        # file://host/... with a non-empty, non-slash host component is
        # re-dispatched as an FTP request to that host.
        if url[:2] == '//' and url[2:3] != '/':
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        # Lazily compute (and cache on the class) the set of IP
        # addresses that count as "this machine".
        if FileHandler.names is None:
            try:
                FileHandler.names = (socket.gethostbyname('localhost'),
                                     socket.gethostbyname(socket.gethostname()))
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        import email.utils
        import mimetypes
        host = req.get_host()
        file = req.get_selector()
        localfile = url2pathname(file)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(file)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            # Serve the file only when the URL names no host, or names
            # this machine (no explicit port allowed).
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                return addinfourl(open(localfile, 'rb'), headers, 'file:'+file)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001202
1203def _safe_gethostbyname(host):
1204 try:
1205 return socket.gethostbyname(host)
1206 except socket.gaierror:
1207 return None
1208
class FTPHandler(BaseHandler):
    # Handler for the "ftp" scheme: logs in (optionally with
    # user:password from the URL), retrieves a file or directory
    # listing, and wraps the result in an addinfourl.
    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.get_host()
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)
        # Path components are unquoted individually; the last component
        # is the file name ('' means a directory listing).
        path, attrs = splitattr(req.get_selector())
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Transfer type: 'I'mage (binary) for files, 'D' for listings;
            # may be overridden by a ";type=x" URL attribute below.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.get_full_url())[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.get_full_url())
        except ftplib.all_errors as msg:
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # Factory hook; CacheFTPHandler overrides this to reuse
        # connections.
        fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
        return fw
1266
class CacheFTPHandler(FTPHandler):
    """FTP handler that caches ftpwrapper connections for reuse.

    Connections are keyed by (user, host, port, path, timeout), expire
    self.delay seconds after last use, and at most self.max_conns are
    kept alive at once.

    XXX would be nice to have pluggable cache strategies
    XXX this stuff is definitely not thread safe
    """

    def __init__(self):
        self.cache = {}      # key -> live ftpwrapper connection
        self.timeout = {}    # key -> absolute expiry time (epoch seconds)
        self.soonest = 0     # earliest expiry among cached entries
        self.delay = 60      # idle lifetime of a connection, in seconds
        self.max_conns = 16  # hard cap on cached connections

    def setTimeout(self, t):
        """Set the idle lifetime (in seconds) for cached connections."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the maximum number of cached connections."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached connection for the key, creating it if needed."""
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            # Reuse: just push the expiry time out.
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Evict expired connections and enforce the max_conns limit."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            # min() of an empty sequence raises ValueError when every
            # entry just expired, so guard explicitly.
            self.soonest = min(self.timeout.values()) if self.timeout else 0

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # Close before dropping so the connection is not
                    # leaked (the expiry path above already does this).
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values()) if self.timeout else 0
1313
# Code moved from the old urllib module

MAXFTPCACHE = 10 # Trim the ftp cache beyond this size

# Helper for non-unix systems: pick a platform-specific implementation
# of url2pathname/pathname2url ('mac' here is pre-OS-X MacOS).
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


ftpcache = {}  # module-level FTP connection cache shared by URLopener instances
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve(). This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe. Bah.

    def __del__(self):
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(to_bytes(fullurl))
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        # Dispatch to an open_<scheme>() method by name.
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error as msg:
            raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        # A local file can be returned directly, without copying.
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError as msg:
                pass
        fp = self.open(url, data)
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            # No filename given: download into a temp file whose suffix
            # is derived from the URL path.
            import tempfile
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            (fd, filename) = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        result = filename, headers
        if self.tempcache is not None:
            self.tempcache[url] = result
        bs = 1024*8
        size = -1
        read = 0
        blocknum = 0
        if reporthook:
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            reporthook(blocknum, bs, size)
        while 1:
            block = fp.read(bs)
            if not block:
                break
            read += len(block)
            tfp.write(block)
            blocknum += 1
            if reporthook:
                reporthook(blocknum, bs, size)
        fp.close()
        tfp.close()
        del fp
        del tfp

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result

    # Each method named open_<type> knows how to open that type of URL

    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_class.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieval or a host, relative-path pair.
        - data is payload for a POST request or None.
        """

        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            # Tuple form: open() signalled a proxied request as
            # (proxyhost, full_url).
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

            #print "proxy via http:", host, selector
        if not host: raise IOError('http error', 'no host given')

        if proxy_passwd:
            import base64
            proxy_auth = base64.b64encode(proxy_passwd).strip()
        else:
            proxy_auth = None

        if user_passwd:
            import base64
            auth = base64.b64encode(user_passwd).strip()
        else:
            auth = None
        http_conn = connection_factory(host)
        # XXX We should fix urllib so that it works with HTTP/1.1.
        http_conn._http_vsn = 10
        http_conn._http_vsn_str = "HTTP/1.0"

        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] = "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost
        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response.fp, response.msg, "http:" + url,
                              response.status)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        return self._open_generic_http(http.client.HTTPConnection, url, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        void = fp.read()
        fp.close()
        raise HTTPError(url, errcode, errmsg, headers, None)

    if _have_ssl:
        def _https_connection(self, host):
            return http.client.HTTPSConnection(host,
                                               key_file=self.key_file,
                                               cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise URLError('file error', 'proxy support for file protocol currently not implemented')
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            return self.open_ftp(url)
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, email.utils
        from io import StringIO
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.errno, e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = splitport(host)
        # Only serve when the host names this machine and no port is given.
        if (not port
            and socket.gethostbyname(host) in (localhost(), thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error', 'not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        if not isinstance(url, str):
            raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
        import mimetypes
        from io import StringIO
        host, path = splithost(url)
        if not host: raise URLError('ftp error', 'no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user: user, passwd = splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in self.ftpcache.keys():
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if not key in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors() as msg:
            raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])

    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise URLError('data error', 'proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise IOError('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            import base64
            data = base64.decodestring(data)
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        headers = email.message_from_string(msg)
        f = io.StringIO(msg)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001777
1778
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        # auth_cache maps "realm@host" -> (user, passwd); tries/maxtries
        # bound the number of consecutive redirects that are followed.
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            # Too many consecutive redirects: report a synthetic 500
            # instead of looping forever.
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Open the URL named by the Location (or URI) header, or
        return None when neither header is present.

        Note that the redirected request is reissued without *data*,
        i.e. a redirected POST is retried as a plain GET.
        """
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        # Drain and close the old response before opening the new URL.
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            # 307 must not silently replay the POST body as a GET;
            # hand the response to the default error handler instead.
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # NOTE(review): the un-returned URLopener.http_error_default
        # calls below appear to rely on the base implementation raising;
        # if it returns instead, control falls through -- confirm.
        if not 'www-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        # Dispatch on the scheme of the current request:
        # retry_http_basic_auth or retry_https_basic_auth.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        if not 'proxy-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        # Dispatch to retry_proxy_http_basic_auth or the https variant.
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Ask for proxy credentials, embed them in the stored http
        proxy URL and retry; return None if the user supplied none."""
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # Strip any credentials already embedded in the proxy host.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Like retry_proxy_http_basic_auth, for the https proxy."""
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Ask for credentials for (host, realm), embed them in the URL
        as user:pass@host and retry; return None if none were given."""
        host, selector = splithost(url)
        # Drop credentials already present in the URL before asking.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Like retry_http_basic_auth, for https URLs."""
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return (user, passwd) for host/realm, consulting the cache.

        A truthy clear_cache forces a fresh prompt (used after a
        previous, evidently rejected, cached answer).
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                                     (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            # Ctrl-C during the prompt means "no credentials".
            print()
            return None, None
1966
1967
1968# Utility functions
1969
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The first lookup's result is cached in the module-level
    _localhost and returned on every later call.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
1977
_thishost = None
def thishost():
    """Return the IP address of the current host.

    Resolved once via gethostbyname(gethostname()) and cached in the
    module-level _thishost.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
1985
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    The ftplib import is deferred and the tuple cached, so callers
    that never touch FTP never pay for the import.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
1994
_noheaders = None
def noheaders():
    """Return an empty email Message object (cached singleton)."""
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2002
2003
2004# Utility classes
2005
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        # dirs is the sequence of path components to cwd through after
        # logging in; timeout is passed straight to FTP.connect().
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        """(Re)connect, log in, and change into self.dirs."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        """Start retrieving *file* ('d'/'D' type means directory listing).

        Returns a (file-like object, length-or-None) pair; reading the
        object streams the transfer and closing it ends the transfer.
        Raises URLError on FTP protocol errors.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection may have gone stale; reconnect and
            # retry the command once.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # 550 means "not a plain file": fall through to the
                # directory-listing branch below; anything else is fatal.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error', reason) from reason
                finally:
                    # Always return to the original directory.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'), self.endtransfer), conn[1])
    def endtransfer(self):
        """Finish the pending transfer, ignoring FTP protocol errors."""
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        """End any pending transfer and close the control connection."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2082
2083# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.
    """
    # 'no_proxy' is deliberately excluded: it is a bypass list consumed
    # by proxy_bypass_environment, not a proxy URL.
    return {
        key[:-6]: value
        for key, value in ((name.lower(), value)
                           for name, value in os.environ.items())
        if value and key.endswith('_proxy') and key != 'no_proxy'
    }
2102
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    """
    suffixes = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if suffixes == '*':
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # bypass when the bare host or host:port ends with any listed suffix
    matched = any(
        entry and (hostonly.endswith(entry) or host.endswith(entry))
        for entry in suffixes.split(',')
    )
    return 1 if matched else 0
2121
2122
if sys.platform == 'darwin':
    def getproxies_internetconfig():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            # Internet Config bindings unavailable: no proxy info.
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXX To be done.
        # Gopher: XXX To be done.
        return proxies

    def proxy_bypass(host):
        # Honour environment no_proxy settings when any proxy
        # variables are set; otherwise never bypass.
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return 0

    def getproxies():
        # Environment variables take precedence over Internet Config.
        return getproxies_environment() or getproxies_internetconfig()
2162
2163elif os.name == 'nt':
2164 def getproxies_registry():
2165 """Return a dictionary of scheme -> proxy server URL mappings.
2166
2167 Win32 uses the registry to store proxies.
2168
2169 """
2170 proxies = {}
2171 try:
2172 import _winreg
2173 except ImportError:
2174 # Std module, so should be around - but you never know!
2175 return proxies
2176 try:
2177 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
2178 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
2179 proxyEnable = _winreg.QueryValueEx(internetSettings,
2180 'ProxyEnable')[0]
2181 if proxyEnable:
2182 # Returned as Unicode but problems if not converted to ASCII
2183 proxyServer = str(_winreg.QueryValueEx(internetSettings,
2184 'ProxyServer')[0])
2185 if '=' in proxyServer:
2186 # Per-protocol settings
2187 for p in proxyServer.split(';'):
2188 protocol, address = p.split('=', 1)
2189 # See if address has a type:// prefix
2190 import re
2191 if not re.match('^([^/:]+)://', address):
2192 address = '%s://%s' % (protocol, address)
2193 proxies[protocol] = address
2194 else:
2195 # Use one setting for all protocols
2196 if proxyServer[:5] == 'http:':
2197 proxies['http'] = proxyServer
2198 else:
2199 proxies['http'] = 'http://%s' % proxyServer
2200 proxies['ftp'] = 'ftp://%s' % proxyServer
2201 internetSettings.Close()
2202 except (WindowsError, ValueError, TypeError):
2203 # Either registry key not found etc, or the value in an
2204 # unexpected format.
2205 # proxies already set up to be empty so nothing to do
2206 pass
2207 return proxies
2208
2209 def getproxies():
2210 """Return a dictionary of scheme -> proxy server URL mappings.
2211
2212 Returns settings gathered from the environment, if specified,
2213 or the registry.
2214
2215 """
2216 return getproxies_environment() or getproxies_registry()
2217
2218 def proxy_bypass_registry(host):
2219 try:
2220 import _winreg
2221 import re
2222 except ImportError:
2223 # Std modules, so should be around - but you never know!
2224 return 0
2225 try:
2226 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
2227 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
2228 proxyEnable = _winreg.QueryValueEx(internetSettings,
2229 'ProxyEnable')[0]
2230 proxyOverride = str(_winreg.QueryValueEx(internetSettings,
2231 'ProxyOverride')[0])
2232 # ^^^^ Returned as Unicode but problems if not converted to ASCII
2233 except WindowsError:
2234 return 0
2235 if not proxyEnable or not proxyOverride:
2236 return 0
2237 # try to make a host list from name and IP address.
Georg Brandl13e89462008-07-01 19:56:00 +00002238 rawHost, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002239 host = [rawHost]
2240 try:
2241 addr = socket.gethostbyname(rawHost)
2242 if addr != rawHost:
2243 host.append(addr)
2244 except socket.error:
2245 pass
2246 try:
2247 fqdn = socket.getfqdn(rawHost)
2248 if fqdn != rawHost:
2249 host.append(fqdn)
2250 except socket.error:
2251 pass
2252 # make a check value list from the registry entry: replace the
2253 # '<local>' string by the localhost entry and the corresponding
2254 # canonical entry.
2255 proxyOverride = proxyOverride.split(';')
2256 i = 0
2257 while i < len(proxyOverride):
2258 if proxyOverride[i] == '<local>':
2259 proxyOverride[i:i+1] = ['localhost',
2260 '127.0.0.1',
2261 socket.gethostname(),
2262 socket.gethostbyname(
2263 socket.gethostname())]
2264 i += 1
2265 # print proxyOverride
2266 # now check if we match one of the registry values.
2267 for test in proxyOverride:
2268 test = test.replace(".", r"\.") # mask dots
2269 test = test.replace("*", r".*") # change glob sequence
2270 test = test.replace("?", r".") # change glob char
2271 for val in host:
2272 # print "%s <--> %s" %( test, val )
2273 if re.match(test, val, re.I):
2274 return 1
2275 return 0
2276
2277 def proxy_bypass(host):
2278 """Return a dictionary of scheme -> proxy server URL mappings.
2279
2280 Returns settings gathered from the environment, if specified,
2281 or the registry.
2282
2283 """
2284 if getproxies_environment():
2285 return proxy_bypass_environment(host)
2286 else:
2287 return proxy_bypass_registry(host)
2288
else:
    # By default use environment variables: getproxies_environment()
    # reads <scheme>_proxy and proxy_bypass_environment() reads no_proxy.
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment