blob: c789ffce63b9a2cb103d1c24b8b024aae5b19cb2 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001# Issues in merging urllib and urllib2:
2# 1. They both define a function named urlopen()
3
4"""An extensible library for opening URLs using a variety of protocols
5
6The simplest way to use this module is to call the urlopen function,
7which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.
10
11The OpenerDirector manages a collection of Handler objects that do
12all the actual work. Each Handler implements a particular protocol or
13option. The OpenerDirector is a composite object that invokes the
14Handlers needed to open the requested URL. For example, the
15HTTPHandler performs HTTP GET and POST requests and deals with
16non-error returns. The HTTPRedirectHandler automatically deals with
17HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
18deals with digest authentication.
19
20urlopen(url, data=None) -- Basic usage is the same as original
21urllib. pass the url and optionally data to post to an HTTP URL, and
22get a file-like object back. One difference is that you can also pass
23a Request instance instead of URL. Raises a URLError (subclass of
24IOError); for HTTP errors, raises an HTTPError, which can also be
25treated as a valid response.
26
27build_opener -- Function that creates a new OpenerDirector instance.
28Will install the default handlers. Accepts one or more Handlers as
29arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of a default
handler, the argument will be installed instead of the default.
32
33install_opener -- Installs a new opener as the default opener.
34
35objects of interest:
36OpenerDirector --
37
38Request -- An object that encapsulates the state of a request. The
39state can be as simple as the URL. It can also include extra HTTP
40headers, e.g. a User-Agent.
41
42BaseHandler --
43
44internals:
45BaseHandler and parent
46_call_chain conventions
47
48Example usage:
49
Georg Brandl029986a2008-06-23 11:44:14 +000050import urllib.request
Jeremy Hylton1afc1692008-06-18 20:49:58 +000051
52# set up authentication info
Georg Brandl029986a2008-06-23 11:44:14 +000053authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000054authinfo.add_password(realm='PDQ Application',
55 uri='https://mahler:8092/site-updates.py',
56 user='klem',
57 passwd='geheim$parole')
58
Georg Brandl029986a2008-06-23 11:44:14 +000059proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton1afc1692008-06-18 20:49:58 +000060
61# build a new opener that adds authentication and caching FTP handlers
Georg Brandl029986a2008-06-23 11:44:14 +000062opener = urllib.request.build_opener(proxy_support, authinfo,
63 urllib.request.CacheFTPHandler)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000064
65# install it
Georg Brandl029986a2008-06-23 11:44:14 +000066urllib.request.install_opener(opener)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000067
Georg Brandl029986a2008-06-23 11:44:14 +000068f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton1afc1692008-06-18 20:49:58 +000069"""
70
71# XXX issues:
# If an authentication error handler tries to perform
# authentication for some reason but fails, how should the error be
# signalled?  The client needs to know the HTTP error code.  But if
# the handler knows what the problem was, e.g., that it didn't know
# the hash algorithm requested in the challenge, it would be good to
# pass that information along to the client, too.
78# ftp errors aren't handled cleanly
79# check digest against correct (i.e. non-apache) implementation
80
81# Possible extensions:
82# complex proxies XXX not sure what exactly was meant by this
83# abstract factory for opener
84
85import base64
86import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import random
93import re
94import socket
95import sys
96import time
Jeremy Hylton1afc1692008-06-18 20:49:58 +000097import bisect
98
Georg Brandl13e89462008-07-01 19:56:00 +000099from urllib.error import URLError, HTTPError, ContentTooShortError
100from urllib.parse import (
101 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
102 splittype, splithost, splitport, splituser, splitpasswd,
Facundo Batistaf24802c2008-08-17 03:36:03 +0000103 splitattr, splitquery, splitvalue, to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000104from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000105
106# check for SSL
107try:
108 import ssl
109except:
110 _have_ssl = False
111else:
112 _have_ssl = True
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000113
114# used in User-Agent header sent
115__version__ = sys.version[:3]
116
# Module-wide opener, created lazily by urlopen() or set by install_opener().
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    """Open *url* (a URL string or a Request object) and return a
    file-like response object.

    Uses the installed module-wide opener, building a default one on
    first use.
    """
    global _opener
    opener = _opener
    if opener is None:
        opener = _opener = build_opener()
    return opener.open(url, data, timeout)
123
def install_opener(opener):
    """Install *opener* as the module-wide opener used by urlopen()."""
    global _opener
    _opener = opener
127
# TODO(jhylton): Make this work with the same global opener.
_urlopener = None  # lazily created FancyURLopener shared by urlretrieve()
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve *url* via the legacy FancyURLopener interface.

    filename, reporthook and data are forwarded unchanged to
    FancyURLopener.retrieve (presumably returning (filename, headers) as
    in the classic urllib API -- confirm against FancyURLopener).
    """
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
135
def urlcleanup():
    """Clean up temporary files left by urlretrieve() and drop the
    installed module-wide opener, if any."""
    global _opener
    if _urlopener:
        _urlopener.cleanup()
    # Resetting unconditionally is equivalent: _opener is either an
    # opener instance (truthy) or already None.
    _opener = None
142
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return the request-host of *request*, as defined by RFC 2965.

    Variation from the RFC: the returned value is lowercased, for
    convenient comparison.
    """
    host = urlparse(request.get_full_url())[1]
    if not host:
        host = request.get_header("Host", "")
    # Strip a trailing :port, if present, and normalize case.
    return _cut_port_re.sub("", host, 1).lower()
160
class Request:
    """Encapsulate the state of a single URL request.

    The state can be as simple as the URL itself; a Request may also
    carry POST data, extra HTTP headers, and the origin/unverifiable
    flags used by cookie processing (RFC 2965).
    """

    def __init__(self, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False):
        """Create a request for *url*.

        url -- the URL to open; may be wrapped as '<URL:type://host/path>'
        data -- optional POST data; when set, get_method() returns "POST"
        headers -- optional mapping of initial headers.  The default is
            None rather than the previous mutable ``{}`` literal, so a
            shared default dict can never leak between instances.
        origin_req_host -- request-host of the origin transaction
            (defaults to the host of this request's URL)
        unverifiable -- True if the user had no option to approve the
            request (RFC 2965)
        """
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self.data = data
        self.headers = {}
        if headers:
            for key, value in headers.items():
                self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        if attr[:12] == '_Request__r_':
            name = attr[12:]
            if hasattr(Request, 'get_' + name):
                # Trigger the lazy computation, then re-read the
                # (now set) mangled attribute.
                getattr(self, 'get_' + name)()
                return getattr(self, attr)
        raise AttributeError(attr)

    def get_method(self):
        """Return the HTTP method: "POST" when data is set, else "GET"."""
        if self.has_data():
            return "POST"
        else:
            return "GET"

    # XXX these helper methods are lame

    def add_data(self, data):
        """Replace the request's POST data."""
        self.data = data

    def has_data(self):
        """Return True if POST data has been set."""
        return self.data is not None

    def get_data(self):
        """Return the POST data (None if unset)."""
        return self.data

    def get_full_url(self):
        """Return the original (unwrapped) URL string."""
        return self.__original

    def get_type(self):
        """Return the URL scheme, computing and caching it on first use.

        Raises ValueError if the URL has no recognizable scheme.
        """
        if self.type is None:
            self.type, self.__r_type = splittype(self.__original)
            if self.type is None:
                raise ValueError("unknown url type: %s" % self.__original)
        return self.type

    def get_host(self):
        """Return the (unquoted) host, computing it on first use."""
        if self.host is None:
            self.host, self.__r_host = splithost(self.__r_type)
            if self.host:
                self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        """Return the selector (the URL portion sent to the server)."""
        return self.__r_host

    def set_proxy(self, host, type):
        """Route this request through proxy *host* using scheme *type*."""
        self.host, self.type = host, type
        # When proxying, the full original URL becomes the selector.
        self.__r_host = self.__original

    def has_proxy(self):
        """Return True once set_proxy() has been called."""
        return self.__r_host == self.__original

    def get_origin_req_host(self):
        """Return the request-host of the origin transaction."""
        return self.origin_req_host

    def is_unverifiable(self):
        """Return the RFC 2965 'unverifiable' flag."""
        return self.unverifiable

    def add_header(self, key, val):
        """Set a header (replacing any existing value for the key).

        Useful for something like authentication.  Keys are normalized
        with str.capitalize().
        """
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        """Set a header that will not be added to a redirected request."""
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """Return True if the header is set (redirected or not)."""
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        """Return a header value, checking normal headers first."""
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return all headers as a list of (name, value) pairs; normal
        headers shadow unredirected ones with the same name."""
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
264
class OpenerDirector:
    """Manage a chain of BaseHandler objects that cooperate to open URLs.

    Handlers are registered with add_handler(); open() routes a request
    through the registered request pre-processors, protocol openers and
    response post-processors, while error() dispatches to the
    registered error handlers.
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        # Headers added to every request; callers may extend this list.
        self.addheaders = [('User-agent', client_version)]
        # manage the individual handlers
        self.handlers = []          # all handlers, kept sorted (see __lt__)
        self.handle_open = {}       # scheme -> [handlers with <scheme>_open]
        self.handle_error = {}      # scheme -> {code -> [error handlers]}
        self.process_response = {}  # scheme -> [response post-processors]
        self.process_request = {}   # scheme -> [request pre-processors]

    def add_handler(self, handler):
        """Register *handler* under every protocol/condition advertised
        by its method names (e.g. http_open, http_error_404,
        http_request, http_response).
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match: these look like
                # <protocol>_<condition> names but are plain methods.
                continue

            # Split "<protocol>_<condition...>" on the first underscore.
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # "http_error_404" -> kind 404; "http_error" -> kind "".
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                # Keep each per-kind list sorted by handler_order.
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # the handlers must work in a specific order, the order
            # is specified in a Handler attribute (handler_order)
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        """Call *meth_name* on each handler registered in chain[kind]
        until one returns a non-None result; return that result (or
        None when every handler declines).
        """
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)

            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a Request) and return the
        response, applying request pre- and response post-processors.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        req.timeout = timeout
        protocol = req.get_type()

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        """Open *req*: try default_open, then <scheme>_open, then
        unknown_open, returning the first non-None result."""
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.get_type()
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered error handlers.

        For http/https, *args* is (request, response, code, msg, hdrs)
        and dispatch is by numeric status code, falling back to
        http_error_default.
        """
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK! (the status code becomes the "proto")
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
403
404# XXX probably also want an abstract factory that knows when it makes
405# sense to skip a superclass in favor of a subclass and when it might
406# make sense to include both
407
def build_opener(*handlers):
    """Create an OpenerDirector from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.  If any of the handlers passed as arguments are
    subclasses (or instances of subclasses) of a default handler, the
    corresponding default is omitted.  Handler classes are
    instantiated; instances are used as given.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    # Drop every default that the caller overrides with a subclass
    # (or an instance of a subclass).
    skip = {klass
            for klass in default_classes
            for check in handlers
            if (issubclass(check, klass) if isclass(check)
                else isinstance(check, klass))}
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for handler in handlers:
        opener.add_handler(handler() if isclass(handler) else handler)
    return opener
445
class BaseHandler:
    """Common base for handlers managed by an OpenerDirector.

    ``handler_order`` determines the handler's position in the chain;
    lower values run earlier.
    """

    handler_order = 500

    def add_parent(self, parent):
        # Called by OpenerDirector.add_handler so the handler can reach
        # back into the director (e.g. to re-open a rewritten request).
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def __lt__(self, other):
        try:
            other_order = other.handler_order
        except AttributeError:
            # Preserve the old behavior of sorting custom classes that
            # are unaware of handler_order after the default ones.
            return True
        return self.handler_order < other_order
463
464
class HTTPErrorProcessor(BaseHandler):
    """Route non-2xx HTTP responses into the opener's error chain."""

    # Runs after all other response processing.
    handler_order = 1000

    def http_response(self, request, response):
        """Return 2xx responses unchanged; hand anything else to
        parent.error(), which may raise HTTPError or substitute a
        response."""
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, a "2xx" code indicates that the
        # client's request was successfully received, understood, and
        # accepted.
        if 200 <= code < 300:
            return response
        return self.parent.error(
            'http', request, response, code, msg, hdrs)

    https_response = http_response
481
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort handler: raise HTTPError for any HTTP error status
    no other handler has dealt with."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000485
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301/302/303/307 redirects, with loop protection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        # Redirect GET/HEAD for any of the 30x codes, and POST only for
        # 301-303; everything else is treated as an error.
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        # Do not propagate body-describing headers: the redirected
        # request is issued without the original POST body.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.get_origin_req_host(),
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Handle a redirect: compute the target URL, delegate to
        redirect_request(), guard against loops, and re-open."""
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL (empty path -> "/")
        urlparts = urlparse(newurl)
        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
            newurl = urlunparse(urlparts)

        # Resolve a relative redirect target against the original URL.
        newurl = urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
578
579
580def _parse_proxy(proxy):
581 """Return (scheme, user, password, host/port) given a URL or an authority.
582
583 If a URL is supplied, it must have an authority (host:port) component.
584 According to RFC 3986, having an authority component means the URL must
585 have two slashes after the scheme:
586
587 >>> _parse_proxy('file:/ftp.example.com/')
588 Traceback (most recent call last):
589 ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
590
591 The first three items of the returned tuple may be None.
592
593 Examples of authority parsing:
594
595 >>> _parse_proxy('proxy.example.com')
596 (None, None, None, 'proxy.example.com')
597 >>> _parse_proxy('proxy.example.com:3128')
598 (None, None, None, 'proxy.example.com:3128')
599
600 The authority component may optionally include userinfo (assumed to be
601 username:password):
602
603 >>> _parse_proxy('joe:password@proxy.example.com')
604 (None, 'joe', 'password', 'proxy.example.com')
605 >>> _parse_proxy('joe:password@proxy.example.com:3128')
606 (None, 'joe', 'password', 'proxy.example.com:3128')
607
608 Same examples, but with URLs instead:
609
610 >>> _parse_proxy('http://proxy.example.com/')
611 ('http', None, None, 'proxy.example.com')
612 >>> _parse_proxy('http://proxy.example.com:3128/')
613 ('http', None, None, 'proxy.example.com:3128')
614 >>> _parse_proxy('http://joe:password@proxy.example.com/')
615 ('http', 'joe', 'password', 'proxy.example.com')
616 >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
617 ('http', 'joe', 'password', 'proxy.example.com:3128')
618
619 Everything after the authority is ignored:
620
621 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
622 ('ftp', 'joe', 'password', 'proxy.example.com')
623
624 Test for no trailing '/' case:
625
626 >>> _parse_proxy('http://joe:password@proxy.example.com')
627 ('http', 'joe', 'password', 'proxy.example.com')
628
629 """
Georg Brandl13e89462008-07-01 19:56:00 +0000630 scheme, r_scheme = splittype(proxy)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000631 if not r_scheme.startswith("/"):
632 # authority
633 scheme = None
634 authority = proxy
635 else:
636 # URL
637 if not r_scheme.startswith("//"):
638 raise ValueError("proxy URL with no authority: %r" % proxy)
639 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
640 # and 3.3.), path is empty or starts with '/'
641 end = r_scheme.find("/", 2)
642 if end == -1:
643 end = None
644 authority = r_scheme[2:end]
Georg Brandl13e89462008-07-01 19:56:00 +0000645 userinfo, hostport = splituser(authority)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000646 if userinfo is not None:
Georg Brandl13e89462008-07-01 19:56:00 +0000647 user, password = splitpasswd(userinfo)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000648 else:
649 user = password = None
650 return scheme, user, password, hostport
651
class ProxyHandler(BaseHandler):
    """Rewrite requests to go through a proxy, per a scheme->proxy map."""

    # Proxies must be in front of the rest of the handler chain.
    handler_order = 100

    def __init__(self, proxies=None):
        # proxies maps a URL scheme to a proxy URL or authority, e.g.
        # {'http': 'http://proxy.example.com:3128'}; defaults come from
        # the environment via getproxies().
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # Synthesize a <scheme>_open method for each configured scheme.
        # The lambda's default arguments bind the *current* url/type/
        # bound-method so every generated method keeps its own values.
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*; add Proxy-authorization if the proxy
        URL carries credentials.

        Returns None (letting other handlers proceed) when the proxy
        speaks the request's own scheme, otherwise re-opens the
        rewritten request from scratch.
        """
        orig_type = req.get_type()
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type
        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type:
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req)
689
class HTTPPasswordMgr:
    """Map (realm, URI prefix) pairs to (user, password) credentials."""

    def __init__(self):
        # {realm: {tuple_of_reduced_uris: (user, password)}}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* at one URI or a sequence of
        URIs."""
        if isinstance(uri, str):
            uri = [uri]
        domain = self.passwd.setdefault(realm, {})
        # Store the URIs both with and without the scheme's default
        # port so a later lookup matches either spelling.
        for default_port in (True, False):
            key = tuple(self.reduce_uri(u, default_port) for u in uri)
            domain[key] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) for *realm*/*authuri*, or (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in (True, False):
            reduced = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                if any(self.is_suburi(uri, reduced) for uri in uris):
                    return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # full URI with an authority component
            scheme, authority = parts[0], parts[1]
            path = parts[2] or '/'
        else:
            # bare host or host:port
            scheme, authority, path = None, uri, '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            # Canonicalize by appending the scheme's default port.
            dport = {"http": 80, "https": 443}.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """True if *test* lies at or below *base* in the URI tree.

        Both arguments must be in the reduced (authority, path) form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        prefix = posixpath.commonprefix((base[1], test[1]))
        return len(prefix) == len(base[1])
752
753
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that treats a realm of None as a wildcard."""

    def find_user_password(self, realm, authuri):
        # Try the specific realm first, then fall back to the
        # catch-all None realm.
        user, password = HTTPPasswordMgr.find_user_password(
            self, realm, authuri)
        if user is None:
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
762
763
class AbstractBasicAuthHandler:
    """Shared machinery for HTTP Basic authentication handlers.

    Concrete subclasses provide ``auth_header`` and call
    http_error_auth_reqed() from their 401/407 handlers.
    """

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        # Expose the manager's add_password directly on the handler.
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Parse the challenge header named *authreq* and, for a Basic
        challenge, retry the request with credentials.

        host may be an authority (without userinfo) or a URL with an
        authority.  XXX there could be multiple challenge headers; only
        the first is consulted.
        """
        challenge = headers.get(authreq, None)
        if not challenge:
            return None
        mo = AbstractBasicAuthHandler.rx.search(challenge)
        if mo is None:
            return None
        scheme, quote, realm = mo.groups()
        if scheme.lower() != 'basic':
            return None
        return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-issue *req* with Basic credentials for *realm*, or return
        None when no password is known or the same credentials already
        failed."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is None:
            return None
        raw = "%s:%s" % (user, pw)
        auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
        if req.headers.get(self.auth_header, None) == auth:
            # These exact credentials were already rejected; give up
            # instead of looping.
            return None
        req.add_header(self.auth_header, auth)
        return self.parent.open(req)
807
808
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Retry 401 responses with Basic credentials for the request URL."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # The full URL serves as the "host" key for the password lookup.
        return self.http_error_auth_reqed('www-authenticate',
                                          req.get_full_url(), req, headers)
817
818
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Retry 407 responses with Basic credentials for the proxy."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo
        # component in authority.  Assume there isn't one, since
        # urllib.request does not (and should not, RFC 3986 s. 3.2.1)
        # support requests for URLs containing userinfo.
        return self.http_error_auth_reqed('proxy-authenticate',
                                          req.get_host(), req, headers)
831
832
def randombytes(n):
    """Return *n* random bytes drawn from the OS entropy source."""
    return os.urandom(n)
836
837class AbstractDigestAuthHandler:
838 # Digest authentication is specified in RFC 2617.
839
840 # XXX The client does not inspect the Authentication-Info header
841 # in a successful response.
842
843 # XXX It should be possible to test this implementation against
844 # a mock server that just generates a static set of challenges.
845
    # XXX qop="auth-int" support is shaky
847
    def __init__(self, passwd=None):
        """Create the handler; *passwd* defaults to a fresh HTTPPasswordMgr."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        # Expose the manager's add_password directly on the handler.
        self.add_password = self.passwd.add_password
        # Number of retries performed after a challenge (capped at 5).
        self.retried = 0
        # nc (nonce count) value sent in qop="auth" responses.
        self.nonce_count = 0
855
    def reset_retry_count(self):
        # Called by the concrete handlers so that a new request starts
        # with a fresh retry budget.
        self.retried = 0
858
    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Handle a 401/407 challenge found in *auth_header*; retry
        with Digest credentials, raising HTTPError after more than five
        consecutive retries."""
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time.  Hm.  Unless the Password Manager is
            # prompting for the information.  Crap.  This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.get_full_url(), 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            # Only the "Digest" scheme is handled here.
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
875
    def retry_http_digest_auth(self, req, auth):
        """Re-issue *req* with an Authorization header computed from
        the server's Digest challenge *auth*; return the new response
        (or None when the same credentials already failed)."""
        token, challenge = auth.split(' ', 1)
        # Parse the comma-separated key="value" challenge parameters.
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # The identical header was already rejected; avoid looping.
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp
887
888 def get_cnonce(self, nonce):
889 # The cnonce-value is an opaque
890 # quoted string value provided by the client and used by both client
891 # and server to avoid chosen plaintext attacks, to provide mutual
892 # authentication, and to provide some message integrity protection.
893 # This isn't a fabulous effort, but it's probably Good Enough.
894 s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
895 b = s.encode("ascii") + randombytes(8)
896 dig = hashlib.sha1(b).hexdigest()
897 return dig[:16]
898
    def get_authorization(self, req, chal):
        """Build the Digest Authorization header value for req.

        chal is the parsed challenge dict.  Returns the header value
        (without the 'Digest ' prefix), or None when the challenge is
        unusable, the algorithm is unsupported, or no password is known.
        Raises URLError for qop values other than 'auth'.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        # H hashes a string; KD(secret, data) == H(secret ':' data).
        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        if qop == 'auth':
            # ncvalue/cnonce are also used when assembling `base` below;
            # they only exist on this branch (qop truthy implies 'auth'
            # here, since other values raise).
            self.nonce_count += 1
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base
954
955 def get_algorithm_impls(self, algorithm):
956 # lambdas assume digest modules are imported at the top level
957 if algorithm == 'MD5':
958 H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
959 elif algorithm == 'SHA':
960 H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
961 # XXX MD5-sess
962 KD = lambda s, d: H("%s:%s" % (s, d))
963 return H, KD
964
    def get_entity_digest(self, data, chal):
        # XXX not implemented yet -- would supply the entity-body digest
        # used for the "digest" parameter in get_authorization().
        # Returning None makes that parameter be omitted entirely.
        return None
968
969
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication for ordinary (non-proxy) 401 responses.

    An authentication protocol defined by RFC 2069: digest
    authentication improves on basic authentication because it does
    not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # netloc component of the request URL is the host to authenticate.
        host = urlparse(req.get_full_url())[1]
        response = self.http_error_auth_reqed('www-authenticate',
                                              host, req, headers)
        self.reset_retry_count()
        return response
986
987
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication for proxy 407 responses."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        proxy_host = req.get_host()
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              proxy_host, req, headers)
        self.reset_retry_count()
        return response
998 return retry
999
class AbstractHTTPHandler(BaseHandler):
    # Shared plumbing for HTTPHandler and HTTPSHandler: request header
    # fix-up in do_request_() and the actual network exchange in do_open().

    def __init__(self, debuglevel=0):
        # Debug level knob; adjustable later via set_http_debuglevel().
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        # Pre-flight fix-up: fill in the headers every request needs.
        # Installed as http_request/https_request in the subclasses;
        # returns the (mutated) request object.
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data(): # POST
            data = request.get_data()
            # Default Content-type and Content-length for POST bodies,
            # unless the caller already supplied them.
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        # For proxied requests the selector is a full URL, so the Host
        # header must name the origin server extracted from it, not the
        # proxy itself.
        sel_host = host
        if request.has_proxy():
            scheme, sel = splittype(request.get_selector())
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        # Opener-wide extra headers (e.g. User-Agent) never override
        # headers already present on the request.
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a email Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = http_class(host, timeout=req.timeout) # will parse host:port
        # Combine ordinary and unredirected headers; the latter win.
        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # Normalize header capitalization (e.g. 'content-type' -> 'Content-Type').
        headers = dict(
            (name.title(), val) for name, val in headers.items())
        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error as err: # XXX what error?
            raise URLError(err)

        ## resp = addinfourl(r.fp, r.msg, req.get_full_url())
        # Wrap the HTTPResponse object itself (not r.fp) so the response
        # body and metadata travel together.
        resp = addinfourl(r, r.msg, req.get_full_url())
        resp.code = r.status
        resp.msg = r.reason
        return resp
1077
1078
class HTTPHandler(AbstractHTTPHandler):
    # Concrete handler for http:// URLs.

    def http_open(self, req):
        return self.do_open(http.client.HTTPConnection, req)

    # Reuse the shared header fix-up as the http_request pre-processor.
    http_request = AbstractHTTPHandler.do_request_
1085
# HTTPSHandler only exists when the interpreter was built with SSL support.
if hasattr(http.client, 'HTTPSConnection'):
    class HTTPSHandler(AbstractHTTPHandler):

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req)

        https_request = AbstractHTTPHandler.do_request_
1093
class HTTPCookieProcessor(BaseHandler):
    """Attach stored cookies to requests and harvest new ones from replies."""

    def __init__(self, cookiejar=None):
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        # Add any applicable Cookie header before the request goes out.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Record Set-Cookie headers from the response into the jar.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1111
class UnknownHandler(BaseHandler):
    """Fallback handler: any scheme no other handler claims is an error."""

    def unknown_open(self, req):
        scheme = req.get_type()
        raise URLError('unknown url type: %s' % scheme)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001116
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Surrounding double quotes are stripped from values.  Returns a dict.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Guard against an empty value ('k='): v[0] would raise IndexError.
        if v and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1126
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    buf = ''
    in_escape = False
    in_quotes = False

    for ch in s:
        if in_escape:
            # Previous char was a backslash inside quotes: take this
            # char literally (the backslash itself is dropped).
            buf += ch
            in_escape = False
        elif in_quotes:
            if ch == '\\':
                in_escape = True
            else:
                if ch == '"':
                    in_quotes = False
                buf += ch
        elif ch == ',':
            # Top-level comma ends the current element (possibly empty).
            items.append(buf)
            buf = ''
        else:
            if ch == '"':
                in_quotes = True
            buf += ch

    # Flush the trailing element, if any.
    if buf:
        items.append(buf)

    return [item.strip() for item in items]
1169
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.get_selector()
        # 'file://host/...' with a non-empty host is retried as FTP;
        # 'file:///path' is a genuinely local file.
        if url[:2] == '//' and url[2:3] != '/':
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        # Lazily computed, cached on the class: IP addresses that count
        # as "this machine" for file:// host checks.
        if FileHandler.names is None:
            try:
                FileHandler.names = (socket.gethostbyname('localhost'),
                                     socket.gethostbyname(socket.gethostname()))
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        import email.utils
        import mimetypes
        host = req.get_host()
        file = req.get_selector()
        localfile = url2pathname(file)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(file)[0]
            # Synthesize HTTP-style headers for the local file.
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            # Only serve the file when the URL names no host, or names
            # this machine (by resolved address) without a port.
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                return addinfourl(open(localfile, 'rb'), headers, 'file:'+file)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001215
1216def _safe_gethostbyname(host):
1217 try:
1218 return socket.gethostbyname(host)
1219 except socket.gaierror:
1220 return None
1221
class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        # Fetch an ftp:// URL and return an addinfourl wrapping the
        # data connection; FTP failures are re-raised as URLError.
        import ftplib
        import mimetypes
        host = req.get_host()
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)
        # Split off ';type=...' style attributes, then the directory path.
        path, attrs = splitattr(req.get_selector())
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Transfer type: binary ('I') for files, directory listing ('D')
            # otherwise, unless a ';type=' attribute overrides it below.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            # Synthesize HTTP-style headers for the FTP payload.
            headers = ""
            mtype = mimetypes.guess_type(req.get_full_url())[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.get_full_url())
        except ftplib.all_errors as msg:
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # Hook point: CacheFTPHandler overrides this to reuse connections.
        fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
        return fw
1279
class CacheFTPHandler(FTPHandler):
    """FTPHandler that reuses live connections, keyed by user/host/port/path.

    Entries expire self.delay seconds after last use, and the cache is
    capped at self.max_conns entries.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        self.cache = {}    # key -> live ftpwrapper
        self.timeout = {}  # key -> absolute expiry time
        self.soonest = 0   # earliest expiry among cached entries
        self.delay = 60    # idle lifetime in seconds
        self.max_conns = 16

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            # The sweep above may have emptied the cache; min() over an
            # empty sequence raises ValueError, so guard it.
            if self.timeout:
                self.soonest = min(self.timeout.values())
            else:
                self.soonest = 0

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            if self.timeout:
                self.soonest = min(self.timeout.values())
            else:
                self.soonest = 0
1326
# Code move from the old urllib module

MAXFTPCACHE = 10 # Trim the ftp cache beyond this size

# Helper for non-unix systems
# NOTE(review): os.name == 'mac' is a legacy value -- confirm it can still
# occur on this Python before relying on the macurl2path branch.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001346
1347# This really consists of two pieces:
1348# (1) a class which handles opening of all sorts of URLs
1349# (plus assorted utilities etc.)
1350# (2) a set of functions for parsing URLs
1351# XXX Should these be separated out into different modules?
1352
1353
ftpcache = {}  # module-level FTP connection cache shared by URLopener instances
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    # Names of temp files created by retrieve(); set per-instance in
    # __init__ and removed again in cleanup().
    __tempfiles = None

    # Default User-Agent string sent by this opener.
    version = "Python-urllib/%s" % __version__
1366
1367 # Constructor
    def __init__(self, proxies=None, **x509):
        """Set up the opener.

        proxies maps scheme -> proxy URL and defaults to the
        environment's settings via getproxies(); x509 may carry
        key_file/cert_file for HTTPS client authentication.
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve(). This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe. Bah.
1390
    def __del__(self):
        # Best-effort temp-file cleanup when the opener is garbage collected.
        self.close()
1393
    def close(self):
        # Public entry point for cleanup(): removes temp files and
        # clears the tempcache.
        self.cleanup()
1396
    def cleanup(self):
        # Remove temp files created by retrieve() and clear the tempcache.
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    # self.__unlink is os.unlink, captured in __init__
                    # precisely because os may be gone at shutdown.
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()
1410
    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        # args is the (name, value) tuple itself.
        self.addheaders.append(args)
1415
1416 # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(to_bytes(fullurl))
        # Serve from the (optional) retrieve() cache when enabled.
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            # Route through the configured proxy for this scheme.
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        # Dispatch to the scheme-specific open_<type>() method.
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error as msg:
            raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1449
1450 def open_unknown(self, fullurl, data=None):
1451 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001452 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001453 raise IOError('url error', 'unknown url type', type)
1454
1455 def open_unknown_proxy(self, proxy, fullurl, data=None):
1456 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001457 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001458 raise IOError('url error', 'invalid proxy for %s' % type, proxy)
1459
1460 # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object.

        reporthook, if given, is called as reporthook(blocknum,
        blocksize, totalsize) during the download.  data is an optional
        POST payload.  Raises ContentTooShortError when fewer bytes
        arrive than Content-Length promised.
        """
        url = unwrap(to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        # A local file needs no copy: hand back its own path directly.
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError as msg:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # No target given: download into a temp file whose suffix
                # mirrors the URL path's extension.
                import tempfile
                garbage, path = splittype(url)
                garbage, path = splithost(path or "")
                path, garbage = splitquery(path or "")
                path, garbage = splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if reporthook:
                    if "content-length" in headers:
                        size = int(headers["Content-Length"])
                    reporthook(blocknum, bs, size)
                # Copy in 8 KiB blocks, reporting progress as we go.
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()
        del fp
        del tfp

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result
1526
1527 # Each method named open_<type> knows how to open that type of URL
1528
1529 def _open_generic_http(self, connection_factory, url, data):
1530 """Make an HTTP connection using connection_class.
1531
1532 This is an internal method that should be called from
1533 open_http() or open_https().
1534
1535 Arguments:
1536 - connection_factory should take a host name and return an
1537 HTTPConnection instance.
1538 - url is the url to retrieval or a host, relative-path pair.
1539 - data is payload for a POST request or None.
1540 """
1541
1542 user_passwd = None
1543 proxy_passwd= None
1544 if isinstance(url, str):
Georg Brandl13e89462008-07-01 19:56:00 +00001545 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001546 if host:
Georg Brandl13e89462008-07-01 19:56:00 +00001547 user_passwd, host = splituser(host)
1548 host = unquote(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001549 realhost = host
1550 else:
1551 host, selector = url
1552 # check whether the proxy contains authorization information
Georg Brandl13e89462008-07-01 19:56:00 +00001553 proxy_passwd, host = splituser(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001554 # now we proceed with the url we want to obtain
Georg Brandl13e89462008-07-01 19:56:00 +00001555 urltype, rest = splittype(selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001556 url = rest
1557 user_passwd = None
1558 if urltype.lower() != 'http':
1559 realhost = None
1560 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001561 realhost, rest = splithost(rest)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001562 if realhost:
Georg Brandl13e89462008-07-01 19:56:00 +00001563 user_passwd, realhost = splituser(realhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001564 if user_passwd:
1565 selector = "%s://%s%s" % (urltype, realhost, rest)
1566 if proxy_bypass(realhost):
1567 host = realhost
1568
1569 #print "proxy via http:", host, selector
1570 if not host: raise IOError('http error', 'no host given')
1571
1572 if proxy_passwd:
1573 import base64
1574 proxy_auth = base64.b64encode(proxy_passwd).strip()
1575 else:
1576 proxy_auth = None
1577
1578 if user_passwd:
1579 import base64
1580 auth = base64.b64encode(user_passwd).strip()
1581 else:
1582 auth = None
1583 http_conn = connection_factory(host)
1584 # XXX We should fix urllib so that it works with HTTP/1.1.
1585 http_conn._http_vsn = 10
1586 http_conn._http_vsn_str = "HTTP/1.0"
1587
1588 headers = {}
1589 if proxy_auth:
1590 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
1591 if auth:
1592 headers["Authorization"] = "Basic %s" % auth
1593 if realhost:
1594 headers["Host"] = realhost
1595 for header, value in self.addheaders:
1596 headers[header] = value
1597
1598 if data is not None:
1599 headers["Content-Type"] = "application/x-www-form-urlencoded"
1600 http_conn.request("POST", selector, data, headers)
1601 else:
1602 http_conn.request("GET", selector, headers=headers)
1603
1604 try:
1605 response = http_conn.getresponse()
1606 except http.client.BadStatusLine:
1607 # something went wrong with the HTTP status line
Georg Brandl13e89462008-07-01 19:56:00 +00001608 raise URLError("http protocol error: bad status line")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001609
1610 # According to RFC 2616, "2xx" code indicates that the client's
1611 # request was successfully received, understood, and accepted.
1612 if 200 <= response.status < 300:
Antoine Pitroub353c122009-02-11 00:39:14 +00001613 return addinfourl(response, response.msg, "http:" + url,
Georg Brandl13e89462008-07-01 19:56:00 +00001614 response.status)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001615 else:
1616 return self.http_error(
1617 url, response.fp,
1618 response.status, response.reason, response.msg, data)
1619
    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        # Thin wrapper: all the work happens in _open_generic_http().
        return self._open_generic_http(http.client.HTTPConnection, url, data)
1623
1624 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
1625 """Handle http errors.
1626
1627 Derived class can override this, or provide specific handlers
1628 named http_error_DDD where DDD is the 3-digit error code."""
1629 # First check if there's a specific handler for this error
1630 name = 'http_error_%d' % errcode
1631 if hasattr(self, name):
1632 method = getattr(self, name)
1633 if data is None:
1634 result = method(url, fp, errcode, errmsg, headers)
1635 else:
1636 result = method(url, fp, errcode, errmsg, headers, data)
1637 if result: return result
1638 return self.http_error_default(url, fp, errcode, errmsg, headers)
1639
1640 def http_error_default(self, url, fp, errcode, errmsg, headers):
1641 """Default error handler: close the connection and raise IOError."""
1642 void = fp.read()
1643 fp.close()
Georg Brandl13e89462008-07-01 19:56:00 +00001644 raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001645
    # HTTPS support only exists when the interpreter has SSL.
    if _have_ssl:
        def _https_connection(self, host):
            # Connection factory handed to _open_generic_http(); carries
            # the client certificate configured in __init__.
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)
1655
1656 def open_file(self, url):
1657 """Use local file or FTP depending on form of URL."""
1658 if not isinstance(url, str):
1659 raise URLError('file error', 'proxy support for file protocol currently not implemented')
1660 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
1661 return self.open_ftp(url)
1662 else:
1663 return self.open_local_file(url)
1664
    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, email.utils
        from io import StringIO
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.errno, e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        # Synthesize HTTP-style headers describing the local file.
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = splitport(host)
        # A named host is acceptable only when portless and resolving to
        # this machine.
        if (not port
           and socket.gethostbyname(host) in (localhost(), thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error', 'not on local host')
1694
1695 def open_ftp(self, url):
1696 """Use FTP protocol."""
1697 if not isinstance(url, str):
1698 raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
1699 import mimetypes
1700 from io import StringIO
Georg Brandl13e89462008-07-01 19:56:00 +00001701 host, path = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001702 if not host: raise URLError('ftp error', 'no host given')
Georg Brandl13e89462008-07-01 19:56:00 +00001703 host, port = splitport(host)
1704 user, host = splituser(host)
1705 if user: user, passwd = splitpasswd(user)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001706 else: passwd = None
Georg Brandl13e89462008-07-01 19:56:00 +00001707 host = unquote(host)
1708 user = unquote(user or '')
1709 passwd = unquote(passwd or '')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001710 host = socket.gethostbyname(host)
1711 if not port:
1712 import ftplib
1713 port = ftplib.FTP_PORT
1714 else:
1715 port = int(port)
Georg Brandl13e89462008-07-01 19:56:00 +00001716 path, attrs = splitattr(path)
1717 path = unquote(path)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001718 dirs = path.split('/')
1719 dirs, file = dirs[:-1], dirs[-1]
1720 if dirs and not dirs[0]: dirs = dirs[1:]
1721 if dirs and not dirs[0]: dirs[0] = '/'
1722 key = user, host, port, '/'.join(dirs)
1723 # XXX thread unsafe!
1724 if len(self.ftpcache) > MAXFTPCACHE:
1725 # Prune the cache, rather arbitrarily
1726 for k in self.ftpcache.keys():
1727 if k != key:
1728 v = self.ftpcache[k]
1729 del self.ftpcache[k]
1730 v.close()
1731 try:
1732 if not key in self.ftpcache:
1733 self.ftpcache[key] = \
1734 ftpwrapper(user, passwd, host, port, dirs)
1735 if not file: type = 'D'
1736 else: type = 'I'
1737 for attr in attrs:
Georg Brandl13e89462008-07-01 19:56:00 +00001738 attr, value = splitvalue(attr)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001739 if attr.lower() == 'type' and \
1740 value in ('a', 'A', 'i', 'I', 'd', 'D'):
1741 type = value.upper()
1742 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
1743 mtype = mimetypes.guess_type("ftp:" + url)[0]
1744 headers = ""
1745 if mtype:
1746 headers += "Content-Type: %s\n" % mtype
1747 if retrlen is not None and retrlen >= 0:
1748 headers += "Content-Length: %d\n" % retrlen
1749 headers = email.message_from_string(headers)
Georg Brandl13e89462008-07-01 19:56:00 +00001750 return addinfourl(fp, headers, "ftp:" + url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001751 except ftperrors() as msg:
1752 raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1753
1754 def open_data(self, url, data=None):
1755 """Use "data" URL."""
1756 if not isinstance(url, str):
1757 raise URLError('data error', 'proxy support for data protocol currently not implemented')
1758 # ignore POSTed data
1759 #
1760 # syntax of data URLs:
1761 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
1762 # mediatype := [ type "/" subtype ] *( ";" parameter )
1763 # data := *urlchar
1764 # parameter := attribute "=" value
1765 try:
1766 [type, data] = url.split(',', 1)
1767 except ValueError:
1768 raise IOError('data error', 'bad data URL')
1769 if not type:
1770 type = 'text/plain;charset=US-ASCII'
1771 semi = type.rfind(';')
1772 if semi >= 0 and '=' not in type[semi:]:
1773 encoding = type[semi+1:]
1774 type = type[:semi]
1775 else:
1776 encoding = ''
1777 msg = []
1778 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
1779 time.gmtime(time.time())))
1780 msg.append('Content-type: %s' % type)
1781 if encoding == 'base64':
1782 import base64
1783 data = base64.decodestring(data)
1784 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001785 data = unquote(data)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001786 msg.append('Content-Length: %d' % len(data))
1787 msg.append('')
1788 msg.append(data)
1789 msg = '\n'.join(msg)
Georg Brandl13e89462008-07-01 19:56:00 +00001790 headers = email.message_from_string(msg)
1791 f = io.StringIO(msg)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001792 #f.fileno = None # needed for addinfourl
Georg Brandl13e89462008-07-01 19:56:00 +00001793 return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001794
1795
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        """Initialize the opener plus the state used by the error handlers."""
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}  # maps "realm@host" -> (user, passwd)
        self.tries = 0        # redirects followed so far for this request
        self.maxtries = 10    # redirect limit before reporting recursion

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            # Too many consecutive redirects: report a synthetic 500
            # instead of looping forever.
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Follow a redirect to the URL named by the Location/URI header.

        Returns None when the response carries no redirect target.
        """
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        # Drain and close the old response before issuing the new request.
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # NOTE(review): the URLopener.http_error_default calls below appear
        # to be expected to raise (there is no return after them) -- confirm
        # against the base class.
        if not 'www-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        # Dispatch to retry_<type>_basic_auth for the current URL scheme.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # NOTE(review): as in http_error_401, the base-class default calls
        # are expected to raise -- confirm against the base class.
        if not 'proxy-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        # Dispatch to retry_proxy_<type>_basic_auth for the URL scheme.
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Re-open *url* after embedding credentials in the http proxy URL."""
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # i is nonzero iff credentials were already embedded in the proxy
        # URL; reusing it as clear_cache below forces a fresh prompt then.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Re-open *url* after embedding credentials in the https proxy URL."""
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # See retry_proxy_http_basic_auth for the meaning of i.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Re-open *url* with user:password@ embedded in the http URL."""
        host, selector = splithost(url)
        # Drop any credentials already present in the URL; a nonzero i
        # also clears the cached entry via get_user_passwd.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Re-open *url* with user:password@ embedded in the https URL."""
        host, selector = splithost(url)
        # See retry_http_basic_auth for the meaning of i.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return (user, password) for realm@host, consulting the cache.

        A truthy clear_cache discards the cached entry and re-prompts.
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            # Ctrl-C at either prompt means "no credentials".
            print()
            return None, None
1983
1984
1985# Utility functions
1986
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup is performed once and cached in the module-level
    ``_localhost`` variable.
    """
    global _localhost
    cached = _localhost
    if cached is None:
        cached = _localhost = socket.gethostbyname('localhost')
    return cached
1994
_thishost = None
def thishost():
    """Return the IP address of the current host.

    The result of the first lookup is cached in ``_thishost``.
    """
    global _thishost
    cached = _thishost
    if cached is None:
        cached = _thishost = socket.gethostbyname(socket.gethostname())
    return cached
2002
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported lazily on first use and the answer is cached in
    ``_ftperrors``.
    """
    global _ftperrors
    errs = _ftperrors
    if errs is None:
        import ftplib
        errs = _ftperrors = ftplib.all_errors
    return errs
2011
_noheaders = None
def noheaders():
    """Return an empty email Message object (a shared, cached instance)."""
    global _noheaders
    msg = _noheaders
    if msg is None:
        msg = _noheaders = email.message_from_string("")
    return msg
2019
2020
2021# Utility classes
2022
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        # The connection parameters are kept so init() can be called again
        # to transparently reconnect after the server drops the session.
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        """Connect, log in and change into the target directory chain."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        # NOTE(review): a None timeout is handed straight to connect();
        # presumably that means "block indefinitely" -- confirm against
        # the ftplib version in use.
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        """Start a retrieval and return (file-like object, length or None).

        *type* 'd'/'D' requests a directory listing; any other value is
        used as the FTP transfer type (e.g. 'I' or 'A').  If a RETR of
        *file* fails with a 550 reply, fall back to a LIST of it.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # Connection probably timed out/dropped; reconnect and retry once.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # 550 means "not a plain file"; fall through to a listing.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error', reason) from reason
                finally:
                    # Always restore the working directory for the cached
                    # connection.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'), self.endtransfer), conn[1])
    def endtransfer(self):
        """Finish the pending transfer, if any, consuming the final reply."""
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        """Finish any pending transfer and close the control connection."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2099
2100# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    # CVE-2016-1000110 ("httpoxy"): when running as a CGI script,
    # HTTP_PROXY holds the client's "Proxy:" request header and is
    # therefore attacker-controlled -- drop the http proxy then.  An
    # exactly-lowercase http_proxy variable is still trusted and is
    # re-applied below, since CGI header variables are always upper-case.
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    for name, value in os.environ.items():
        if name[-6:] == '_proxy':
            name = name.lower()
            if value:
                proxies[name[:-6]] = value
            else:
                proxies.pop(name[:-6], None)
    return proxies
2116
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    Matching is case-insensitive (DNS names are), and whitespace around
    list entries is ignored.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # DNS names are case-insensitive: compare everything in lowercase.
    hostonly = hostonly.lower()
    host = host.lower()
    # check if the host ends with any of the DNS suffixes
    for name in no_proxy.split(','):
        name = name.strip().lower()
        if name and (hostonly.endswith(name) or host.endswith(name)):
            return 1
    # otherwise, don't bypass
    return 0
2135
2136
if sys.platform == 'darwin':
    def getproxies_internetconfig():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        # NOTE(review): 'ic' is the classic-Mac-OS Internet Config module;
        # it is not available on Python 3, so this presumably always falls
        # back to the empty dict there -- confirm the targeted version.
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXX To be done.
        # Gopher: XXX To be done.
        return proxies

    def proxy_bypass(host):
        """Test if proxies should not be used for a particular host.

        Environment variables, when present, take precedence; with no
        proxy environment configured nothing is bypassed.
        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return 0

    def getproxies():
        """Return proxy settings from the environment or Internet Config."""
        return getproxies_environment() or getproxies_internetconfig()
2176
elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        # NOTE(review): Python 3 renamed _winreg to winreg (PEP 3108); if
        # this file targets Python 3 the import below always fails and
        # registry-configured proxies are silently ignored -- confirm.
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        import re
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        """Test against the registry's ProxyOverride patterns for *host*.

        Returns 1 when *host* (or its resolved address/FQDN) matches one
        of the glob patterns, 0 otherwise.
        """
        # See the note in getproxies_registry about the _winreg name.
        try:
            import _winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        i = 0
        while i < len(proxyOverride):
            if proxyOverride[i] == '<local>':
                proxyOverride[i:i+1] = ['localhost',
                                        '127.0.0.1',
                                        socket.gethostname(),
                                        socket.gethostbyname(
                                            socket.gethostname())]
            i += 1
        # print proxyOverride
        # now check if we match one of the registry values.
        for test in proxyOverride:
            # Translate the glob pattern into a regular expression.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Test if proxies should not be used for a particular host.

        Uses the no_proxy environment setting when any proxy environment
        variables are present, otherwise the registry's ProxyOverride.
        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)
2302
else:
    # Neither macOS nor Windows: there is no system proxy store, so the
    # environment-variable based implementations defined above serve as
    # the platform defaults.
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment