blob: 5c90aeae5beb8e7d573cbb0f5c5cf5eb59bf7fad [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""An extensible library for opening URLs using a variety of protocols
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00002
3The simplest way to use this module is to call the urlopen function,
Tim Peterse1190062001-01-15 03:34:38 +00004which accepts a string containing a URL or a Request object (described
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00005below). It opens the URL and returns the results as file-like
6object; the returned object has some extra methods described below.
7
Jeremy Hyltone1906632002-10-11 17:27:55 +00008The OpenerDirector manages a collection of Handler objects that do
Tim Peterse1190062001-01-15 03:34:38 +00009all the actual work. Each Handler implements a particular protocol or
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000010option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
Raymond Hettinger024aaa12003-04-24 15:32:12 +000014HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000016
17urlopen(url, data=None) -- basic usage is the same as original
18urllib. pass the url and optionally data to post to an HTTP URL, and
Tim Peterse1190062001-01-15 03:34:38 +000019get a file-like object back. One difference is that you can also pass
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000020a Request instance instead of URL. Raises a URLError (subclass of
21IOError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- function that creates a new OpenerDirector instance.
25will install the default handlers. accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
27instantiate. if one of the arguments is a subclass of the default
28handler, the argument will be installed instead of the default.
29
30install_opener -- installs a new opener as the default opener.
31
32objects of interest:
33OpenerDirector --
34
35Request -- an object that encapsulates the state of a request. the
36state can be as simple as the URL. it can also include extra HTTP
37headers, e.g. a User-Agent.
38
39BaseHandler --
40
41exceptions:
42URLError-- a subclass of IOError, individual protocols have their own
43specific subclass
44
Tim Peterse1190062001-01-15 03:34:38 +000045HTTPError-- also a valid HTTP response, so you can treat an HTTP error
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000046as an exceptional event or valid response
47
48internals:
49BaseHandler and parent
50_call_chain conventions
51
52Example usage:
53
54import urllib2
55
56# set up authentication info
57authinfo = urllib2.HTTPBasicAuthHandler()
58authinfo.add_password('realm', 'host', 'username', 'password')
59
Moshe Zadka8a18e992001-03-01 08:40:42 +000060proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
61
Tim Peterse1190062001-01-15 03:34:38 +000062# build a new opener that adds authentication and caching FTP handlers
Moshe Zadka8a18e992001-03-01 08:40:42 +000063opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000064
65# install it
66urllib2.install_opener(opener)
67
68f = urllib2.urlopen('http://www.python.org/')
69
70
71"""
72
73# XXX issues:
74# If an authentication error handler tries to perform
Fred Draked5214b02001-11-08 17:19:29 +000075# authentication for some reason but fails, how should the error be
76# signalled? The client needs to know the HTTP error code. But if
77# the handler knows that the problem was, e.g., that it didn't know
78# the hash algo that was requested in the challenge, it would be good to
79# pass that information along to the client, too.
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000080
81# XXX to do:
82# name!
83# documentation (getting there)
84# complex proxies
85# abstract factory for opener
86# ftp errors aren't handled cleanly
87# gopher can return a socket.error
88# check digest against correct (i.e. non-apache) implementation
89
Jeremy Hyltonfcefd0d2003-10-21 18:07:07 +000090import base64
91import ftplib
92import gopherlib
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000093import httplib
Jeremy Hylton8b78b992001-10-09 16:18:45 +000094import inspect
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000095import md5
96import mimetypes
97import mimetools
Jeremy Hyltonfcefd0d2003-10-21 18:07:07 +000098import os
99import posixpath
100import random
101import re
Jeremy Hylton6d8c1aa2001-08-27 20:16:53 +0000102import rfc822
Jeremy Hyltonfcefd0d2003-10-21 18:07:07 +0000103import sha
104import socket
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000105import sys
106import time
Jeremy Hyltonfcefd0d2003-10-21 18:07:07 +0000107import urlparse
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000108
109try:
110 from cStringIO import StringIO
111except ImportError:
112 from StringIO import StringIO
113
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000114# not sure how many of these need to be gotten rid of
115from urllib import unwrap, unquote, splittype, splithost, \
116 addinfourl, splitport, splitgophertype, splitquery, \
117 splitattr, ftpwrapper, noheaders
118
Jeremy Hyltonfcefd0d2003-10-21 18:07:07 +0000119# support for FileHandler, proxies via environment variables
120from urllib import localhost, url2pathname, getproxies
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000121
Jeremy Hyltonfcefd0d2003-10-21 18:07:07 +0000122__version__ = "2.1"
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000123
_opener = None
def urlopen(url, data=None):
    """Open url with the module-wide default opener and return the result.

    url and data are handed unchanged to the opener's open() method.
    If no opener has been installed yet, one is created with
    build_opener() and remembered for later calls.
    """
    global _opener
    opener = _opener
    if opener is None:
        # Lazily create the default opener on first use.
        opener = _opener = build_opener()
    return opener.open(url, data)
130
def install_opener(opener):
    """Make opener the module-wide default used by urlopen()."""
    global _opener
    _opener = opener
134
135# do these error classes make sense?
Tim Peterse1190062001-01-15 03:34:38 +0000136# make sure all of the IOError stuff is overridden. we just want to be
Fred Drakea87a5212002-08-13 13:59:55 +0000137# subtypes.
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000138
class URLError(IOError):
    """Error raised while opening a URL; `reason` describes the failure.

    URLError is a sub-type of IOError but shares none of its
    implementation: IOError.__init__ is not called, and __init__ and
    __str__ are overridden.  self.args is set for compatibility with
    other EnvironmentError subclasses, but it holds only the reason
    rather than the usual (errno, strerror) pair.
    """

    def __init__(self, reason):
        self.reason = reason
        self.args = (reason,)

    def __str__(self):
        return '<urlopen error %s>' % self.reason
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000151
class HTTPError(URLError, addinfourl):
    """An HTTP error that can also be used like a normal response.

    Attributes set by __init__: code, msg, hdrs, fp and filename (the
    URL).  When fp is not None the addinfourl base is initialized with
    (fp, hdrs, url) as well.
    """
    # Private alias for the addinfourl initializer, used by __init__.
    __super_init = addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        self.code = code
        self.msg = msg
        self.hdrs = hdrs
        self.fp = fp
        self.filename = url
        # The addinfourl classes depend on fp being a valid file
        # object.  An HTTPError may be created without one; in that
        # case the simplest workaround is to leave the base classes
        # uninitialized.
        if fp is None:
            return
        self.__super_init(fp, hdrs, url)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)

    def __del__(self):
        # XXX is this safe? what if user catches exception, then
        # extracts fp and discards exception?
        fp = self.fp
        if fp:
            fp.close()
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000177
class GopherError(URLError):
    """URLError subclass for gopher failures; adds no behavior of its own."""
180
Moshe Zadka8a18e992001-03-01 08:40:42 +0000181
class Request:
    """State of one URL request: the URL, optional data and headers.

    The URL is split lazily: get_type() and get_host() parse the
    pieces on first use and cache them on the instance.
    """

    def __init__(self, url, data=None, headers={}):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        # Filled in on demand by get_type() / get_host().
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self.data = data
        self.headers = {}
        # Route every header through add_header() so keys are normalized.
        for name, value in headers.items():
            self.add_header(name, value)

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        prefix = '_Request__r_'
        if attr[:len(prefix)] == prefix:
            getter = 'get_' + attr[len(prefix):]
            if hasattr(Request, getter):
                # Calling the getter is expected to set the missing
                # attribute as a side effect; then look it up again.
                getattr(self, getter)()
                return getattr(self, attr)
        raise AttributeError(attr)

    def get_method(self):
        """Return "POST" when the request carries data, else "GET"."""
        if not self.has_data():
            return "GET"
        return "POST"

    def add_data(self, data):
        """Attach data to the request, replacing any previous data."""
        self.data = data

    def has_data(self):
        """Return true if data has been set (i.e. is not None)."""
        return self.data is not None

    def get_data(self):
        """Return the request data (None if there is none)."""
        return self.data

    def get_full_url(self):
        """Return the unwrapped URL given to the constructor."""
        return self.__original

    def get_type(self):
        """Return the URL scheme; raise ValueError if the URL has none."""
        if self.type is None:
            scheme, remainder = splittype(self.__original)
            self.type = scheme
            self.__r_type = remainder
            if scheme is None:
                raise ValueError("unknown url type: %s" % self.__original)
        return self.type

    def get_host(self):
        """Return the unquoted host part of the URL."""
        if self.host is None:
            host, remainder = splithost(self.__r_type)
            self.host = host
            self.__r_host = remainder
            if host:
                self.host = unquote(host)
        return self.host

    def get_selector(self):
        """Return what follows the host (or the full URL after set_proxy)."""
        return self.__r_host

    def set_proxy(self, host, type):
        """Redirect the request to a proxy at host using scheme type.

        The selector becomes the complete original URL.
        """
        self.host = host
        self.type = type
        self.__r_host = self.__original

    def add_header(self, key, val):
        """Store a header; the key is normalized with str.capitalize()."""
        # useful for something like authentication
        self.headers[key.capitalize()] = val
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000250
class OpenerDirector:
    """Dispatch requests to a collection of registered handlers.

    handle_open maps a protocol name to the list of handlers that
    define a <protocol>_open method.  handle_error maps a protocol
    name to a dictionary that maps an error kind (an int where the
    method suffix is numeric, otherwise the suffix string) to the list
    of handlers defining <protocol>_error_<kind>.
    """

    def __init__(self):
        server_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', server_version)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}

    def add_handler(self, handler):
        """Register handler under every *_open / *_error_* method it has.

        Handler lists are kept sorted, so handlers must be mutually
        comparable.  A handler with no matching method is ignored.
        """
        added = 0
        for meth in dir(handler):
            if meth[-5:] == '_open':
                chain = self.handle_open.setdefault(meth[:-5], [])
                chain.append(handler)
                chain.sort()
                added = 1
                continue
            # Look for names of the form <proto>_error_<kind>.  The
            # second underscore is searched for directly so that -1
            # really means "not found".
            i = meth.find('_')
            if i == -1:
                continue
            j = meth.find('_', i + 1)
            if j != -1 and meth[i+1:j] == 'error':
                proto = meth[:i]
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    # Non-numeric kinds (e.g. 'default') stay strings.
                    pass
                chain = self.handle_error.setdefault(proto, {}) \
                            .setdefault(kind, [])
                chain.append(handler)
                chain.sort()
                added = 1
        if added:
            self.handlers.append(handler)
            self.handlers.sort()
            handler.add_parent(self)

    def __del__(self):
        self.close()

    def close(self):
        """Close every registered handler and forget them."""
        for handler in self.handlers:
            handler.close()
        self.handlers = []

    def _call_chain(self, chain, kind, meth_name, *args):
        """Call meth_name(*args) on each handler in chain[kind] in order.

        Returns the first result that is not None, or None if every
        handler declined (or there are no handlers for kind).
        """
        # XXX raise an exception if no one else should try to handle
        # this url.  return None if you can't but someone else could.
        for handler in chain.get(kind, ()):
            result = getattr(handler, meth_name)(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None):
        """Open fullurl, which is a URL string or a Request object.

        The 'default' chain is tried first, then the chain for the
        request's protocol, and finally the 'unknown' chain.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, basestring):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        type_ = req.get_type()
        result = self._call_chain(self.handle_open, type_,
                                  type_ + '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error for proto to the registered error handlers.

        For 'http' and 'https', args[2] is used as the status code and
        http_error_<code> handlers are tried first, falling back to the
        http_error_default handlers.
        """
        if proto in ['http', 'https']:
            # XXX http[s] protocols are special-cased
            # https is not different than http
            table = self.handle_error['http']
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%d' % proto
            http_err = 1
            orig_args = args
        else:
            # NOTE(review): here the values looked up by _call_chain are
            # the per-protocol kind dictionaries built by add_handler,
            # not handler lists -- confirm this path is ever exercised.
            table = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (table, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (table, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000357
Gustavo Niemeyer9556fba2003-06-07 17:53:08 +0000358# XXX probably also want an abstract factory that knows when it makes
359# sense to skip a superclass in favor of a subclass and when it might
360# make sense to include both
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000361
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    Each argument may be a handler instance or a handler class (which
    is instantiated without arguments).  A default handler class is
    left out when any argument is a subclass, or an instance of a
    subclass, of it.  Default handlers are added before the supplied
    ones.
    """

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(HTTPSHandler)

    for klass in default_classes:
        # Decide per default class whether it is overridden.  Doing it
        # this way (instead of collecting a skip list and calling
        # list.remove for each entry) cannot fail when several supplied
        # handlers derive from the same default class.
        overridden = False
        for check in handlers:
            if inspect.isclass(check):
                if issubclass(check, klass):
                    overridden = True
                    break
            elif isinstance(check, klass):
                overridden = True
                break
        if not overridden:
            opener.add_handler(klass())

    for h in handlers:
        if inspect.isclass(h):
            h = h()
        opener.add_handler(h)
    return opener
397
class BaseHandler:
    """Common base for handlers: parent bookkeeping and sort order.

    Handlers are ordered by handler_order, lower values first.
    """
    handler_order = 500

    def add_parent(self, parent):
        """Remember the object this handler was added to."""
        self.parent = parent

    def close(self):
        """Drop the reference to the parent."""
        self.parent = None

    def __lt__(self, other):
        if hasattr(other, "handler_order"):
            return self.handler_order < other.handler_order
        # Try to preserve the old behavior of having custom classes
        # inserted after default ones (works only for custom user
        # classes which are not aware of handler_order).
        return True
Tim Petersf545baa2003-06-15 23:26:30 +0000412
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000413
class HTTPDefaultErrorHandler(BaseHandler):
    """Fallback HTTP error handler: turns every error into an HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        url = req.get_full_url()
        raise HTTPError(url, code, msg, hdrs, fp)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000417
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirects, with loop detection."""

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        method = req.get_method()
        if method in ("GET", "HEAD"):
            follow = code in (301, 302, 303, 307)
        elif method == "POST":
            # Strictly (according to RFC 2616), 301 or 302 in response
            # to a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we
            # do the same.
            follow = code in (301, 302, 303)
        else:
            follow = False
        if not follow:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)
        return Request(newurl, headers=req.headers)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Handle a redirect response by opening the new location.

        Returns None when the response names no new location or
        redirect_request() declines; raises HTTPError when the
        redirect chain looks like a loop.
        """
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        # The new location may be relative to the current URL.
        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        visited = {}
        new.error_302_dict = visited
        if hasattr(req, 'error_302_dict'):
            seen = req.error_302_dict
            if len(seen) > 10 or newurl in seen:
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
            visited.update(seen)
        visited[newurl] = newurl

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000483
class ProxyHandler(BaseHandler):
    """Route requests through the proxies given as {scheme: proxy URL}.

    For every scheme in the mapping a <scheme>_open method is created
    on the instance that forwards to proxy_open().
    """
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        """proxies maps scheme -> proxy URL; defaults to getproxies()."""
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, url in proxies.items():
            # Default arguments bind the current loop values.
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy=url, type=scheme, meth=self.proxy_open: \
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Point req at proxy, adding Proxy-authorization if needed.

        Returns None when the proxy uses the same scheme as the request
        (so that later handlers open it), otherwise re-opens the request
        through the parent.
        """
        orig_type = req.get_type()
        # The scheme of the proxy URL itself is what counts from here on.
        proxy_type, r_type = splittype(proxy)
        host, _rest = splithost(r_type)
        if '@' in host:
            user_pass, host = host.split('@', 1)
            if ':' in user_pass:
                user, password = user_pass.split(':', 1)
                raw = '%s:%s' % (unquote(user), unquote(password))
                # encodestring() ends its output with a newline and
                # wraps long input; a header value must not contain
                # line breaks, so remove them all.
                user_pass = base64.encodestring(raw).replace('\n', '')
            req.add_header('Proxy-authorization', 'Basic ' + user_pass)
        host = unquote(host)
        req.set_proxy(host, proxy_type)
        if orig_type == proxy_type:
            # let other handlers take care of it
            # XXX this only makes sense if the proxy is before the
            # other handlers
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            return self.parent.open(req)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000520
521# feature suggested by Duncan Booth
522# XXX custom is not a good name
class CustomProxy:
    """Pairs a protocol and proxy address with an optional predicate."""

    # either pass a function to the constructor or override handle
    def __init__(self, proto, func=None, proxy_addr=None):
        self.proto = proto
        self.func = func
        self.addr = proxy_addr

    def handle(self, req):
        """Return 1 if func is set and func(req) is true, else None."""
        predicate = self.func
        if not predicate:
            return None
        if predicate(req):
            return 1
        return None

    def get_proxy(self):
        """Return the proxy address given to the constructor."""
        return self.addr
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000536
class CustomProxyHandler(BaseHandler):
    """Choose a proxy per request from registered CustomProxy objects.

    self.proxies maps a protocol name to the list of proxy objects
    registered for it, in registration order.
    """
    # Proxies must be in front
    handler_order = 100

    def __init__(self, *proxies):
        """Register each proxy object given (see add_proxy)."""
        self.proxies = {}
        # The arguments used to be silently discarded.
        for cpo in proxies:
            self.add_proxy(cpo)

    def proxy_open(self, req):
        """Open req through the first registered proxy that accepts it.

        Returns None when no proxy is registered for the request's
        protocol or none of them accepts the request.
        """
        proto = req.get_type()
        for p in self.proxies.get(proto, ()):
            if p.handle(req):
                # set_proxy() requires the scheme as well as the host.
                # NOTE(review): assumes the proxy speaks the request's
                # own protocol -- confirm against intended usage.
                req.set_proxy(p.get_proxy(), proto)
                return self.parent.open(req)
        return None

    def do_proxy(self, p, req):
        return self.parent.open(req)

    def add_proxy(self, cpo):
        """Register cpo under its proto attribute."""
        self.proxies.setdefault(cpo.proto, []).append(cpo)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000564
class HTTPPasswordMgr:
    """Store (user, password) pairs keyed by realm and URI.

    self.passwd maps realm -> {tuple of reduced URIs: (user, passwd)}.
    """

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for realm at uri (one URI or a sequence)."""
        # uri could be a single URI or a sequence
        if isinstance(uri, basestring):
            uri = [uri]
        uri = tuple(map(self.reduce_uri, uri))
        if realm not in self.passwd:
            self.passwd[realm] = {}
        self.passwd[realm][uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, passwd) for authuri in realm, or (None, None)."""
        domains = self.passwd.get(realm, {})
        authuri = self.reduce_uri(authuri)
        for uris, authinfo in domains.items():
            for uri in uris:
                if self.is_suburi(uri, authuri):
                    return authinfo
        return None, None

    def reduce_uri(self, uri):
        """Accept netloc or URI and extract only the netloc and path"""
        parts = urlparse.urlparse(uri)
        if parts[1]:
            return parts[1], parts[2] or '/'
        else:
            # No netloc was parsed: treat the path as the netloc.
            return parts[2], '/'

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare whole path segments: '/foobar' must not count as
        # being below '/foo', which a character-wise common prefix
        # would allow.
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix = prefix + '/'
        return test[1].startswith(prefix)
Tim Peterse1190062001-01-15 03:34:38 +0000608
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000609
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the realm None."""

    def find_user_password(self, realm, authuri):
        """Look up realm first; if no user is found, try realm None."""
        found = HTTPPasswordMgr.find_user_password(self, realm, authuri)
        user, password = found
        if user is None:
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
618
619
class AbstractBasicAuthHandler:
    """Shared logic for retrying a request with Basic credentials.

    Subclasses provide auth_header (the request header to set) and a
    parent attribute whose open() re-issues the request.
    """

    # Matches '<scheme> realm="<realm>"' at the start of a challenge.
    rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)

    # XXX there can actually be multiple auth-schemes in a
    # www-authenticate header.  should probably be a lot more careful
    # in parsing them to extract multiple alternatives

    def __init__(self, password_mgr=None):
        """Use password_mgr for lookups (default: a new HTTPPasswordMgr)."""
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry req if the header named authreq holds a Basic challenge.

        Returns None when the header is missing, does not match, or
        names another scheme.
        """
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)
        if not authreq:
            return None
        mo = AbstractBasicAuthHandler.rx.match(authreq)
        if not mo:
            return None
        scheme, realm = mo.groups()
        if scheme.lower() != 'basic':
            return None
        return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-open req with credentials for (realm, host), if known.

        Returns None when no credentials are stored or when req already
        carries exactly these credentials (retrying would not help).
        """
        user, pw = self.passwd.find_user_password(realm, host)
        # Only None means "no credentials"; an empty password is valid.
        if pw is None:
            return None
        raw = "%s:%s" % (user, pw)
        auth = 'Basic %s' % base64.encodestring(raw).strip()
        if req.headers.get(self.auth_header, None) == auth:
            return None
        req.add_header(self.auth_header, auth)
        return self.parent.open(req)
655
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses by retrying with Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Handle a 401 using the 'www-authenticate' challenge header."""
        # Element 1 of the urlparse() result is used as the host under
        # which credentials are looked up.
        parts = urlparse.urlparse(req.get_full_url())
        netloc = parts[1]
        return self.http_error_auth_reqed('www-authenticate',
                                          netloc, req, headers)
664
665
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses by retrying with Basic credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Handle a 407 using the 'proxy-authenticate' challenge header."""
        # The request's host is used as the credential lookup location.
        proxy_host = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          proxy_host, req, headers)
674
675
def randombytes(n):
    """Return n random bytes.

    Reads from /dev/urandom when that path exists; otherwise builds the
    result from the random module.
    """
    # Use /dev/urandom if it is available.  Fall back to random module
    # if not.  It might be worthwhile to extend this function to use
    # other platform-specific mechanisms for getting random bytes.
    if os.path.exists("/dev/urandom"):
        # Binary mode: the device yields arbitrary byte values that must
        # not go through any text-mode translation.
        f = open("/dev/urandom", "rb")
        try:
            return f.read(n)
        finally:
            # Close the descriptor even if the read fails.
            f.close()
    else:
        L = [chr(random.randrange(0, 256)) for i in range(n)]
        return "".join(L)
689
class AbstractDigestAuthHandler:
    """Mixin implementing the retry step of HTTP Digest authentication.

    The class reads ``self.auth_header`` (name of the request header that
    carries the credentials) and ``self.parent`` (used to re-open the
    request); both are expected to be supplied by the concrete handler
    class this is mixed into.
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        """Store the password manager (an HTTPPasswordMgr is created if
        none is given) and zero the retry and nonce counters."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0

    def reset_retry_count(self):
        """Allow a fresh series of authentication attempts."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry *req* with digest credentials.

        auth_header -- name of the challenge header to read from *headers*
        host        -- accepted for symmetry with the basic handler; unused
        req         -- the request that was rejected
        headers     -- headers of the rejecting response

        Returns the result of the retry, or None if there is no challenge
        header.  Raises HTTPError once the retry counter exceeds 5 and
        ValueError if the challenge uses a scheme other than 'digest'.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.get_full_url(), 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            else:
                raise ValueError("AbstractDigestAuthHandler doesn't know "
                                 "about %s"%(scheme))

    def retry_http_digest_auth(self, req, auth):
        """Parse the challenge *auth*, add the response header to *req*
        and re-open it.

        Returns None when no authorization could be computed or when
        *req* already carries the identical header value.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(parse_http_list(challenge))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp

    def get_cnonce(self, nonce):
        """Return a 16-hex-digit client nonce for the server *nonce*."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        dig = sha.new("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
                                       randombytes(8))).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the value of the digest credentials header.

        req  -- the request being retried
        chal -- dict of challenge parameters (see parse_keqv_list)

        Returns the header value without the leading 'Digest ', or None
        when the challenge lacks 'realm'/'nonce', names an algorithm or
        qop this class does not implement, or no user is known for the
        realm.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.has_data() and 'POST' or 'GET',
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
            qop_fields = ''
        elif 'auth' in [opt.strip() for opt in qop.split(',')]:
            # The challenge may offer several qop values as a
            # comma-separated list (e.g. "auth,auth-int"); pick 'auth'.
            self.nonce_count += 1
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth',
                                           H(A2))
            respdig = KD(H(A1), noncebit)
            qop_fields = ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        else:
            # XXX handle auth-int.
            # No response digest can be computed for this qop; give up
            # on this challenge instead of failing on an unbound name.
            return None

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base = base + ', opaque="%s"' % opaque
        if entdig:
            base = base + ', digest="%s"' % entdig
        if algorithm != 'MD5':
            base = base + ', algorithm="%s"' % algorithm
        base = base + qop_fields
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the pair (H, KD) of digest functions for *algorithm*.

        Both are None when the algorithm is not 'MD5' or 'SHA', which is
        what get_authorization() checks for.
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: md5.new(x).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: sha.new(x).hexdigest()
        else:
            # XXX MD5-sess
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        """Placeholder for the entity digest; always returns None."""
        # XXX not implemented yet
        return None
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000824
Moshe Zadka8a18e992001-03-01 08:40:42 +0000825
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069 (revised by RFC 2617)

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Handle a 401 using the 'www-authenticate' challenge header."""
        host = urlparse.urlparse(req.get_full_url())[1]
        try:
            return self.http_error_auth_reqed('www-authenticate',
                                              host, req, headers)
        finally:
            # Reset even when the retry raises; otherwise the counter
            # stays above the limit and every later request handled by
            # this instance fails immediately.
            self.reset_retry_count()
Moshe Zadka8a18e992001-03-01 08:40:42 +0000841
842
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 responses by retrying with digest credentials."""

    auth_header = 'Proxy-Authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Handle a 407 using the 'proxy-authenticate' challenge header."""
        host = req.get_host()
        try:
            return self.http_error_auth_reqed('proxy-authenticate',
                                              host, req, headers)
        finally:
            # Reset even when the retry raises; otherwise the counter
            # stays above the limit and every later request handled by
            # this instance fails immediately.
            self.reset_retry_count()
Tim Peterse1190062001-01-15 03:34:38 +0000853
Moshe Zadka8a18e992001-03-01 08:40:42 +0000854class AbstractHTTPHandler(BaseHandler):
855
Jeremy Hylton828023b2003-05-04 23:44:49 +0000856 # XXX Should rewrite do_open() to use the new httplib interface,
Walter Dörwaldf0dfc7a2003-10-20 14:01:56 +0000857 # would be a little simpler.
Jeremy Hylton828023b2003-05-04 23:44:49 +0000858
Moshe Zadka8a18e992001-03-01 08:40:42 +0000859 def do_open(self, http_class, req):
Moshe Zadka76676802001-04-11 07:44:53 +0000860 host = req.get_host()
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000861 if not host:
862 raise URLError('no host given')
863
Jeremy Hylton828023b2003-05-04 23:44:49 +0000864 h = http_class(host) # will parse host:port
865 if req.has_data():
866 data = req.get_data()
867 h.putrequest('POST', req.get_selector())
868 if not 'Content-type' in req.headers:
869 h.putheader('Content-type',
870 'application/x-www-form-urlencoded')
871 if not 'Content-length' in req.headers:
872 h.putheader('Content-length', '%d' % len(data))
873 else:
874 h.putrequest('GET', req.get_selector())
Tim Peterse1190062001-01-15 03:34:38 +0000875
Jeremy Hylton144dea32002-07-07 16:57:35 +0000876 scheme, sel = splittype(req.get_selector())
877 sel_host, sel_path = splithost(sel)
878 h.putheader('Host', sel_host or host)
Brett Cannon783eaf42003-06-17 21:52:34 +0000879 for name, value in self.parent.addheaders:
880 name = name.capitalize()
Jeremy Hylton96f11292002-10-11 17:26:46 +0000881 if name not in req.headers:
Brett Cannon783eaf42003-06-17 21:52:34 +0000882 h.putheader(name, value)
Brett Cannondf0d87a2003-05-18 02:25:07 +0000883 for k, v in req.headers.items():
Fred Drake13a2c272000-02-10 17:17:14 +0000884 h.putheader(k, v)
Jeremy Hyltonf6b444e2003-05-05 01:47:13 +0000885 # httplib will attempt to connect() here. be prepared
886 # to convert a socket error to a URLError.
Jeremy Hylton828023b2003-05-04 23:44:49 +0000887 try:
888 h.endheaders()
889 except socket.error, err:
890 raise URLError(err)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000891 if req.has_data():
Fred Drakeec3dfde2001-07-04 05:18:29 +0000892 h.send(data)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000893
894 code, msg, hdrs = h.getreply()
895 fp = h.getfile()
896 if code == 200:
897 return addinfourl(fp, hdrs, req.get_full_url())
898 else:
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000899 return self.parent.error('http', req, fp, code, msg, hdrs)
900
Moshe Zadka8a18e992001-03-01 08:40:42 +0000901
class HTTPHandler(AbstractHTTPHandler):
    """Open http: URLs."""

    def http_open(self, req):
        """Send *req* using httplib.HTTP."""
        connection_class = httplib.HTTP
        return self.do_open(connection_class, req)
906
907
# The handler is only defined when httplib provides an HTTPS class.
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Open https: URLs."""

        def https_open(self, req):
            """Send *req* using httplib.HTTPS."""
            connection_class = httplib.HTTPS
            return self.do_open(connection_class, req)
913
914
class UnknownHandler(BaseHandler):
    """Reject requests by raising URLError naming the request's type."""

    def unknown_open(self, req):
        """Always raise URLError('unknown url type: ...')."""
        url_type = req.get_type()
        message = 'unknown url type: %s' % url_type
        raise URLError(message)
919
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Each element is split at its first '='; a value enclosed in double
    quotes has the quotes removed.  Returns a dict mapping key to value.
    Raises ValueError for an element that contains no '='.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices rather than indexing, so that an empty value ('key=')
        # is accepted instead of raising IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
929
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.

    Returns the list of elements with surrounding whitespace stripped.
    Raises ValueError("unbalanced quotes") when a quoted-string is still
    open, a comma follows, and no closing quote remains.
    """
    # XXX this function could probably use more testing

    items = []
    end = len(s)
    pos = 0         # where scanning resumes
    start = 0       # where the element being collected begins
    inquote = 0
    while pos < end:
        comma = s.find(',', pos)
        quote = s.find('"', pos)
        if comma == -1:
            # No separator left: the remainder is the last element.
            items.append(s[start:])
            break
        if quote == -1:
            if inquote:
                raise ValueError("unbalanced quotes")
            items.append(s[start:comma])
            # Advance start as well as pos; otherwise every later
            # element would begin again at the previous element.
            pos = start = comma + 1
            continue
        if inquote:
            if quote < comma:
                # The closing quote comes first, so this comma really
                # separates two elements.
                items.append(s[start:comma])
                pos = start = comma + 1
                inquote = 0
            else:
                # The comma lies inside the quoted-string; resume at
                # the closing quote.
                pos = quote
        else:
            if comma < quote:
                items.append(s[start:comma])
                pos = start = comma + 1
            else:
                inquote = 1
                pos = quote + 1
    return [item.strip() for item in items]
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000975
class FileHandler(BaseHandler):
    """Open file: URLs from the local filesystem or via FTP."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Dispatch *req* to the FTP machinery or to open_local_file().

        A selector that starts with exactly two slashes is re-typed as
        'ftp' and re-opened through the parent; anything else is
        treated as a local file.
        """
        selector = req.get_selector()
        two_slashes_only = selector[:2] == '//' and selector[2:3] != '/'
        if not two_slashes_only:
            return self.open_local_file(req)
        req.type = 'ftp'
        return self.parent.open(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the addresses regarded as local, resolving them on
        first use and caching the tuple on the class."""
        if FileHandler.names is None:
            loopback = socket.gethostbyname('localhost')
            own_address = socket.gethostbyname(socket.gethostname())
            FileHandler.names = (loopback, own_address)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Open the file named by the selector of *req*.

        Returns an addinfourl with Content-type, Content-length and
        Last-modified headers.  Raises URLError('file not on local
        host') when the URL names a host with a port, or a host that
        does not resolve to one of get_names().
        """
        host = req.get_host()
        selector = req.get_selector()
        localfile = url2pathname(selector)
        stats = os.stat(localfile)
        size = stats.st_size
        modified = rfc822.formatdate(stats.st_mtime)
        mtype = mimetypes.guess_type(selector)[0]
        header_text = (
            'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        headers = mimetools.Message(StringIO(header_text))
        if host:
            host, port = splitport(host)
        # port is only bound when host was non-empty; the "not host"
        # test short-circuits before it is read otherwise.
        if not host or \
           (not port and socket.gethostbyname(host) in self.get_names()):
            return addinfourl(open(localfile, 'rb'),
                              headers, 'file:'+selector)
        raise URLError('file not on local host')
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001013
1014class FTPHandler(BaseHandler):
1015 def ftp_open(self, req):
Fred Drake13a2c272000-02-10 17:17:14 +00001016 host = req.get_host()
1017 if not host:
1018 raise IOError, ('ftp error', 'no host given')
1019 # XXX handle custom username & password
Jeremy Hylton73574ee2000-10-12 18:54:18 +00001020 try:
1021 host = socket.gethostbyname(host)
1022 except socket.error, msg:
1023 raise URLError(msg)
Fred Drake13a2c272000-02-10 17:17:14 +00001024 host, port = splitport(host)
1025 if port is None:
1026 port = ftplib.FTP_PORT
1027 path, attrs = splitattr(req.get_selector())
1028 path = unquote(path)
Eric S. Raymondb08b2d32001-02-09 11:10:16 +00001029 dirs = path.split('/')
Fred Drake13a2c272000-02-10 17:17:14 +00001030 dirs, file = dirs[:-1], dirs[-1]
1031 if dirs and not dirs[0]:
1032 dirs = dirs[1:]
1033 user = passwd = '' # XXX
1034 try:
1035 fw = self.connect_ftp(user, passwd, host, port, dirs)
1036 type = file and 'I' or 'D'
1037 for attr in attrs:
1038 attr, value = splitattr(attr)
Eric S. Raymondb08b2d32001-02-09 11:10:16 +00001039 if attr.lower() == 'type' and \
Fred Drake13a2c272000-02-10 17:17:14 +00001040 value in ('a', 'A', 'i', 'I', 'd', 'D'):
Eric S. Raymondb08b2d32001-02-09 11:10:16 +00001041 type = value.upper()
Fred Drake13a2c272000-02-10 17:17:14 +00001042 fp, retrlen = fw.retrfile(file, type)
Guido van Rossum833a8d82001-08-24 13:10:13 +00001043 headers = ""
1044 mtype = mimetypes.guess_type(req.get_full_url())[0]
1045 if mtype:
Brett Cannon783eaf42003-06-17 21:52:34 +00001046 headers += "Content-type: %s\n" % mtype
Fred Drake13a2c272000-02-10 17:17:14 +00001047 if retrlen is not None and retrlen >= 0:
Brett Cannon783eaf42003-06-17 21:52:34 +00001048 headers += "Content-length: %d\n" % retrlen
Guido van Rossum833a8d82001-08-24 13:10:13 +00001049 sf = StringIO(headers)
1050 headers = mimetools.Message(sf)
Fred Drake13a2c272000-02-10 17:17:14 +00001051 return addinfourl(fp, headers, req.get_full_url())
1052 except ftplib.all_errors, msg:
1053 raise IOError, ('ftp error', msg), sys.exc_info()[2]
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001054
1055 def connect_ftp(self, user, passwd, host, port, dirs):
1056 fw = ftpwrapper(user, passwd, host, port, dirs)
1057## fw.ftp.set_debuglevel(1)
1058 return fw
1059
class CacheFTPHandler(FTPHandler):
    """FTP handler that reuses ftpwrapper objects between requests.

    Wrappers are cached under the key (user, passwd, host, port) and
    dropped once their expiry time has passed or the cache is full.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}         # key -> ftpwrapper
        self.timeout = {}       # key -> expiry time (time.time() based)
        self.soonest = 0        # smallest expiry time seen at last check
        self.delay = 60         # seconds added to "now" on each use
        self.max_conns = 16     # cache size at which an entry is evicted

    def setTimeout(self, t):
        """Set the delay added to the current time when an entry is used."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which the soonest-expiring entry is evicted."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs):
        """Return the cached wrapper for this login/host, creating it
        on a miss, and refresh its expiry time."""
        key = user, passwd, host, port
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Close and drop expired entries, then enforce the size limit."""
        # first check for old ones
        now = time.time()
        if self.soonest <= now:
            expired = [key for key, expiry in self.timeout.items()
                       if expiry < now]
            for key in expired:
                self.cache[key].close()
                del self.cache[key]
                del self.timeout[key]
        self.soonest = min(self.timeout.values())

        # then check the size
        if len(self.cache) == self.max_conns:
            # Drop one entry whose expiry equals the earliest one.
            for key, expiry in self.timeout.items():
                if expiry == self.soonest:
                    del self.cache[key]
                    del self.timeout[key]
                    break
            self.soonest = min(self.timeout.values())
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001105
class GopherHandler(BaseHandler):
    """Open gopher: URLs through gopherlib."""

    def gopher_open(self, req):
        """Send the selector (and query, if any) of *req* to its host.

        Returns an addinfourl with no headers.  Raises GopherError when
        the request has no host.
        """
        host = req.get_host()
        if not host:
            raise GopherError('no host given')
        host = unquote(host)
        # The gopher type is split off the selector and not used further.
        gopher_type, remainder = splitgophertype(req.get_selector())
        selector, query = splitquery(remainder)
        selector = unquote(selector)
        if not query:
            fp = gopherlib.send_selector(selector, host)
        else:
            fp = gopherlib.send_query(selector, unquote(query), host)
        return addinfourl(fp, noheaders(), req.get_full_url())
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001122
#bleck! don't use this yet
class OpenerFactory:
    """Experimental builder of OpenerDirector objects.

    build_opener() installs default_handlers followed by any handlers
    registered through add_handler().  replace_handler() is not
    implemented.
    """

    default_handlers = [UnknownHandler, HTTPHandler,
                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
                        FTPHandler, FileHandler]
    handlers = []
    replacement_handlers = []

    def add_handler(self, h):
        """Register an extra handler (class or instance) on this factory."""
        # Rebinding instead of appending keeps the class-level list
        # from being shared between factory instances.
        self.handlers = self.handlers + [h]

    def replace_handler(self, h):
        """Not implemented; does nothing."""
        pass

    def build_opener(self):
        """Return an OpenerDirector with the default and added handlers.

        Handler classes are instantiated with no arguments; instances
        are installed as they are.
        """
        opener = OpenerDirector()
        for ph in self.default_handlers + self.handlers:
            if inspect.isclass(ph):
                ph = ph()
            opener.add_handler(ph)
        # Hand the configured opener back to the caller; without this
        # the method built an opener and discarded it.
        return opener
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001144
1145if __name__ == "__main__":
Tim Peterse1190062001-01-15 03:34:38 +00001146 # XXX some of the test code depends on machine configurations that
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001147 # are internal to CNRI. Need to set up a public server with the
1148 # right authentication configuration for test purposes.
1149 if socket.gethostname() == 'bitdiddle':
1150 localhost = 'bitdiddle.cnri.reston.va.us'
Jeremy Hylton73574ee2000-10-12 18:54:18 +00001151 elif socket.gethostname() == 'bitdiddle.concentric.net':
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001152 localhost = 'localhost'
1153 else:
1154 localhost = None
1155 urls = [
Fred Drake13a2c272000-02-10 17:17:14 +00001156 # Thanks to Fred for finding these!
1157 'gopher://gopher.lib.ncsu.edu/11/library/stacks/Alex',
1158 'gopher://gopher.vt.edu:10010/10/33',
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001159
Fred Drake13a2c272000-02-10 17:17:14 +00001160 'file:/etc/passwd',
1161 'file://nonsensename/etc/passwd',
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001162 'ftp://www.python.org/pub/python/misc/sousa.au',
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001163 'ftp://www.python.org/pub/tmp/blat',
Fred Drake13a2c272000-02-10 17:17:14 +00001164 'http://www.espn.com/', # redirect
1165 'http://www.python.org/Spanish/Inquistion/',
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001166 ('http://www.python.org/cgi-bin/faqw.py',
Fred Drake13a2c272000-02-10 17:17:14 +00001167 'query=pythonistas&querytype=simple&casefold=yes&req=search'),
1168 'http://www.python.org/',
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001169 'ftp://gatekeeper.research.compaq.com/pub/DEC/SRC/research-reports/00README-Legal-Rules-Regs',
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001170 ]
1171
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001172## if localhost is not None:
1173## urls = urls + [
1174## 'file://%s/etc/passwd' % localhost,
1175## 'http://%s/simple/' % localhost,
1176## 'http://%s/digest/' % localhost,
1177## 'http://%s/not/found.h' % localhost,
1178## ]
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001179
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001180## bauth = HTTPBasicAuthHandler()
1181## bauth.add_password('basic_test_realm', localhost, 'jhylton',
1182## 'password')
1183## dauth = HTTPDigestAuthHandler()
1184## dauth.add_password('digest_test_realm', localhost, 'jhylton',
1185## 'password')
Tim Peterse1190062001-01-15 03:34:38 +00001186
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001187
1188 cfh = CacheFTPHandler()
1189 cfh.setTimeout(1)
1190
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001191## # XXX try out some custom proxy objects too!
1192## def at_cnri(req):
1193## host = req.get_host()
1194## print host
1195## if host[-18:] == '.cnri.reston.va.us':
1196## return 1
1197## p = CustomProxy('http', at_cnri, 'proxy.cnri.reston.va.us')
1198## ph = CustomProxyHandler(p)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001199
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001200## install_opener(build_opener(dauth, bauth, cfh, GopherHandler, ph))
1201 install_opener(build_opener(cfh, GopherHandler))
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001202
1203 for url in urls:
Walter Dörwald65230a22002-06-03 15:58:32 +00001204 if isinstance(url, tuple):
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001205 url, req = url
1206 else:
1207 req = None
1208 print url
1209 try:
1210 f = urlopen(url, req)
1211 except IOError, err:
Fred Drake13a2c272000-02-10 17:17:14 +00001212 print "IOError:", err
1213 except socket.error, err:
1214 print "socket.error:", err
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001215 else:
1216 buf = f.read()
1217 f.close()
1218 print "read %d bytes" % len(buf)
1219 print
1220 time.sleep(0.1)