blob: 3e700a9d87bf5f7f0e995970cc9e24a6cc6ddc63 [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""An extensible library for opening URLs using a variety of protocols
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00002
3The simplest way to use this module is to call the urlopen function,
Tim Peterse1190062001-01-15 03:34:38 +00004which accepts a string containing a URL or a Request object (described
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00005below). It opens the URL and returns the results as file-like
6object; the returned object has some extra methods described below.
7
Jeremy Hyltone1906632002-10-11 17:27:55 +00008The OpenerDirector manages a collection of Handler objects that do
Tim Peterse1190062001-01-15 03:34:38 +00009all the actual work. Each Handler implements a particular protocol or
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000010option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
Raymond Hettinger024aaa12003-04-24 15:32:12 +000014HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000016
urlopen(url, data=None) -- basic usage is the same as the original
urllib.  Pass the URL and optionally data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of a URL.  Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.
23
build_opener -- function that creates a new OpenerDirector instance.
It installs the default handlers and accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of a default
handler, the argument will be installed instead of the default.
29
30install_opener -- installs a new opener as the default opener.
31
32objects of interest:
33OpenerDirector --
34
Request -- an object that encapsulates the state of a request.  The
state can be as simple as the URL.  It can also include extra HTTP
headers, e.g. a User-Agent.
38
39BaseHandler --
40
41exceptions:
42URLError-- a subclass of IOError, individual protocols have their own
43specific subclass
44
Tim Peterse1190062001-01-15 03:34:38 +000045HTTPError-- also a valid HTTP response, so you can treat an HTTP error
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000046as an exceptional event or valid response
47
48internals:
49BaseHandler and parent
50_call_chain conventions
51
52Example usage:
53
54import urllib2
55
56# set up authentication info
57authinfo = urllib2.HTTPBasicAuthHandler()
58authinfo.add_password('realm', 'host', 'username', 'password')
59
Moshe Zadka8a18e992001-03-01 08:40:42 +000060proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
61
Tim Peterse1190062001-01-15 03:34:38 +000062# build a new opener that adds authentication and caching FTP handlers
Moshe Zadka8a18e992001-03-01 08:40:42 +000063opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000064
65# install it
66urllib2.install_opener(opener)
67
68f = urllib2.urlopen('http://www.python.org/')
69
70
71"""
72
73# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails for some reason, how should the error be signalled?  The
# client needs to know the HTTP error code.  But if the handler knows
# what the problem was, e.g., that it didn't recognize the hash
# algorithm requested in the challenge, it would be good to pass that
# information along to the client, too.
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000080
81# XXX to do:
82# name!
83# documentation (getting there)
84# complex proxies
85# abstract factory for opener
86# ftp errors aren't handled cleanly
87# gopher can return a socket.error
88# check digest against correct (i.e. non-apache) implementation
89
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000090import socket
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000091import httplib
Jeremy Hylton8b78b992001-10-09 16:18:45 +000092import inspect
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000093import re
94import base64
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000095import urlparse
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000096import md5
97import mimetypes
98import mimetools
Jeremy Hylton6d8c1aa2001-08-27 20:16:53 +000099import rfc822
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000100import ftplib
101import sys
102import time
Jeremy Hylton6d8c1aa2001-08-27 20:16:53 +0000103import os
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000104import gopherlib
Moshe Zadka8a18e992001-03-01 08:40:42 +0000105import posixpath
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000106
107try:
108 from cStringIO import StringIO
109except ImportError:
110 from StringIO import StringIO
111
112try:
113 import sha
114except ImportError:
115 # need 1.5.2 final
116 sha = None
117
118# not sure how many of these need to be gotten rid of
119from urllib import unwrap, unquote, splittype, splithost, \
120 addinfourl, splitport, splitgophertype, splitquery, \
121 splitattr, ftpwrapper, noheaders
122
123# support for proxies via environment variables
124from urllib import getproxies
125
126# support for FileHandler
Moshe Zadka8a18e992001-03-01 08:40:42 +0000127from urllib import localhost, url2pathname
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000128
129__version__ = "2.0a1"
130
# Module-wide default opener.  It stays None until the first urlopen()
# call builds one, or until install_opener() supplies one.
_opener = None
def urlopen(url, data=None):
    """Open url through the module-wide default opener.

    url and data are handed unchanged to the opener's open() method and
    its result is returned.  If no default opener exists yet, one is
    created with build_opener() and kept for later calls.
    """
    global _opener
    opener = _opener
    if opener is None:
        opener = _opener = build_opener()
    return opener.open(url, data)
137
def install_opener(opener):
    """Store opener in the module-level _opener slot.

    The object is stored as-is; no validation is performed here.
    """
    global _opener
    _opener = opener
141
142# do these error classes make sense?
Tim Peterse1190062001-01-15 03:34:38 +0000143# make sure all of the IOError stuff is overridden. we just want to be
Fred Drakea87a5212002-08-13 13:59:55 +0000144# subtypes.
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000145
class URLError(IOError):
    """Error raised when a URL cannot be opened.

    URLError is a sub-type of IOError, but it doesn't share any of the
    implementation, so __init__ and __str__ are both overridden.

    reason -- object describing the failure; stored unchanged as
              self.reason.
    """

    def __init__(self, reason):
        self.reason = reason
        # IOError.__init__ is deliberately not called, so populate args
        # by hand to keep the generic exception protocol usable.
        self.args = (reason,)

    def __str__(self):
        # Wrap reason in a 1-tuple: formatting with a bare tuple-valued
        # reason would unpack it as the format arguments and raise
        # TypeError instead of producing a message.
        return '<urlopen error %s>' % (self.reason,)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000154
class HTTPError(URLError, addinfourl):
    """HTTP error that can be raised, yet also used like a normal response.

    url is stored as filename; code, msg, hdrs and fp are stored under
    their own names.
    """

    def __init__(self, url, code, msg, hdrs, fp):
        self.filename = url
        self.code, self.msg = code, msg
        self.hdrs, self.fp = hdrs, fp
        # The addinfourl classes depend on fp being a valid file object.
        # An HTTPError may be built without one; the simplest workaround
        # is then to leave the base classes uninitialized.
        if fp is None:
            return
        addinfourl.__init__(self, fp, hdrs, url)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)

    def __del__(self):
        # XXX is this safe?  what if user catches exception, then
        # extracts fp and discards exception?
        fp = self.fp
        if fp:
            fp.close()
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000180
class GopherError(URLError):
    """URLError subclass that adds no behaviour of its own.

    NOTE(review): presumably raised by the gopher handling code, which is
    not visible in this part of the file -- confirm.
    """
183
Moshe Zadka8a18e992001-03-01 08:40:42 +0000184
class Request:
    """Encapsulates the state of one URL request.

    Holds the URL, optional data to send, and a private copy of the
    extra headers.  The URL is split into type, host and selector lazily,
    the first time the matching get_*() accessor is called.
    """

    def __init__(self, url, data=None, headers={}):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        # Filled in on demand by get_type() / get_host().
        # self.__r_type is what's left after doing the splittype.
        self.type = None
        self.host = None
        self.port = None
        self.data = data
        # Copy the mapping so add_header() never touches the caller's
        # dict (or the shared default argument).
        own_headers = {}
        own_headers.update(headers)
        self.headers = own_headers

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        prefix = '_Request__r_'
        if attr.startswith(prefix):
            getter_name = 'get_' + attr[len(prefix):]
            if hasattr(Request, getter_name):
                # Running the getter is expected to set the attribute.
                getattr(self, getter_name)()
                return getattr(self, attr)
        raise AttributeError(attr)

    def get_method(self):
        """Return "POST" when data is attached, otherwise "GET"."""
        if not self.has_data():
            return "GET"
        return "POST"

    def add_data(self, data):
        """Replace the data attached to the request."""
        self.data = data

    def has_data(self):
        """Return true if data has been attached (anything but None)."""
        return self.data is not None

    def get_data(self):
        """Return the attached data (None if there is none)."""
        return self.data

    def get_full_url(self):
        """Return the unwrapped URL the request was created with."""
        return self.__original

    def get_type(self):
        """Return the URL type, splitting the URL on first use.

        Raises ValueError if the URL has no type.
        """
        if self.type is not None:
            return self.type
        self.type, self.__r_type = splittype(self.__original)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.__original)
        return self.type

    def get_host(self):
        """Return the unquoted host, splitting it off on first use."""
        if self.host is not None:
            return self.host
        self.host, self.__r_host = splithost(self.__r_type)
        if self.host:
            self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        """Return what is left of the URL after the host was split off."""
        return self.__r_host

    def set_proxy(self, host, type):
        """Redirect the request to a proxy.

        host and type replace the request's own; the selector becomes the
        complete original URL.
        """
        self.host = host
        self.type = type
        self.__r_host = self.__original

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key] = val
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000252
class OpenerDirector:
    """Dispatches requests and errors to registered handler objects.

    Handlers are indexed by the names of their methods: "<protocol>_open"
    methods go into handle_open, "<protocol>_error_<kind>" methods into
    handle_error.
    """

    def __init__(self):
        agent = "Python-urllib/%s" % __version__
        self.addheaders = [('User-Agent', agent)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}

    def add_handler(self, handler):
        """Register handler under every method name that looks relevant.

        The handler is kept, and told about its parent, only if at least
        one of its attribute names matched.
        """
        registered = 0
        for meth in dir(handler):
            if meth[-5:] == '_open':
                protocol = meth[:-5]
                self.handle_open.setdefault(protocol, []).append(handler)
                registered = 1
                continue
            # Look for "<proto>_error_<kind>": first and second are the
            # positions of the first two underscores.
            first = meth.find('_')
            second = meth[first+1:].find('_') + first + 1
            if second != -1 and meth[first+1:second] == 'error':
                proto = meth[:first]
                kind = meth[second+1:]
                try:
                    # numeric kinds (HTTP status codes) are stored as ints
                    kind = int(kind)
                except ValueError:
                    pass
                by_kind = self.handle_error.setdefault(proto, {})
                by_kind.setdefault(kind, []).append(handler)
                registered = 1
        if registered:
            self.handlers.append(handler)
            handler.add_parent(self)

    def __del__(self):
        self.close()

    def close(self):
        """Close every registered handler and forget them all."""
        for handler in self.handlers:
            handler.close()
        self.handlers = []

    def _call_chain(self, chain, kind, meth_name, *args):
        # XXX raise an exception if no one else should try to handle
        # this url.  return None if you can't but someone else could.
        for handler in chain.get(kind, ()):
            outcome = getattr(handler, meth_name)(*args)
            if outcome is not None:
                return outcome
        return None

    def open(self, fullurl, data=None):
        """Open fullurl, which is a URL string or a Request object.

        Tries the 'default' handlers first, then the handlers for the
        request's type, and finally the 'unknown' handlers.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, basestring):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)
        assert isinstance(req, Request) # really only care about interface

        openers = self.handle_open
        response = self._call_chain(openers, 'default', 'default_open', req)
        if response:
            return response

        type_ = req.get_type()
        response = self._call_chain(openers, type_, type_ + '_open', req)
        if response:
            return response

        return self._call_chain(openers, 'unknown', 'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error for protocol proto to the error handlers.

        For http and https the handlers for the specific status code are
        tried first and the 'default' http error handlers second.
        """
        is_http = proto in ['http', 'https']
        if is_http:
            # XXX http[s] protocols are special-cased
            # https is not different than http
            table = self.handle_error['http']
            # YUCK!  The third positional argument selects the handler;
            # the handler signatures in this file put the status code
            # there.
            kind = args[2]
            meth_name = 'http_error_%d' % kind
        else:
            table = self.handle_error
            kind = proto
            meth_name = proto + '_error'

        outcome = self._call_chain(table, kind, meth_name, *args)
        if outcome:
            return outcome

        if is_http:
            return self._call_chain(table, 'default',
                                    'http_error_default', *args)
        return None
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000357
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000358# XXX probably also want an abstract factory that knows things like
Fred Drakea87a5212002-08-13 13:59:55 +0000359# the fact that a ProxyHandler needs to get inserted first.
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000360# would also know when it makes sense to skip a superclass in favor of
Fred Drakea87a5212002-08-13 13:59:55 +0000361# a subclass and when it might make sense to include both
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000362
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.  If there is a ProxyHandler, it must be at the
    front of the list of handlers.  (Yuck.)

    handlers may be handler instances or handler classes; classes are
    instantiated with no arguments.  A default handler is left out when
    any of the given handlers is an instance or subclass of it.

    Returns the populated OpenerDirector.
    """

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(HTTPSHandler)

    # Install each default unless a caller-supplied handler overrides it.
    # Stopping at the first override matters: collecting one entry per
    # overriding handler and removing them afterwards raised ValueError
    # when two supplied handlers derived from the same default class.
    for klass in default_classes:
        for check in handlers:
            if inspect.isclass(check):
                overridden = issubclass(check, klass)
            else:
                overridden = isinstance(check, klass)
            if overridden:
                break
        else:
            opener.add_handler(klass())

    for h in handlers:
        if inspect.isclass(h):
            h = h()
        opener.add_handler(h)
    return opener
399
class BaseHandler:
    """Base class for handlers; keeps track of the handler's parent."""

    def add_parent(self, parent):
        """Remember parent, the object this handler was added to."""
        self.parent = parent

    def close(self):
        """Drop the reference to the parent."""
        self.parent = None
405
class HTTPDefaultErrorHandler(BaseHandler):
    """Handler whose default action for an HTTP error is to raise it."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        """Raise HTTPError built from the request URL and the response."""
        url = req.get_full_url()
        raise HTTPError(url, code, msg, hdrs, fp)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000409
class HTTPRedirectHandler(BaseHandler):
    """Follows HTTP 301, 302, 303 and 307 redirects."""

    def _redirect_target(self, req, headers):
        # Return the absolute redirect URL named by the response headers,
        # or None when neither a location nor a uri header is present.
        # XXX 301 and 302 errors must have a location or uri header.
        # Not sure about the other error codes.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return None
        return urlparse.urljoin(req.get_full_url(), newurl)

    def redirect_request(self, req, fp, code, msg, headers):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a redirection
        response is received.  If a redirection should take place, return a
        new Request to allow http_error_30x to perform the redirect.
        Otherwise, raise HTTPError if no-one else should try to handle this
        url.  Return None if you can't but another Handler might.
        """
        newurl = self._redirect_target(req, headers)
        if newurl is None:
            return None

        m = req.get_method()
        if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (302, 303) and m == "POST"):
            # Strictly (according to RFC 2616), 302 in response to a
            # POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we
            # do the same.
            return Request(newurl, headers=req.headers)
        # The parameter is called headers; the previous spelling (hdrs)
        # was an undefined name and turned this into a NameError.
        raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow a redirect response and return the result of opening it.

        Returns None when the response names no target or when
        redirect_request() declines.  Raises HTTPError when the target
        was already visited or too many redirects have been followed.
        """
        newurl = self._redirect_target(req, headers)
        if newurl is None:
            return None

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers)
        if new is None:
            return None

        # loop detection: carry the set of visited URLs from request to
        # request in a handler-specific attribute
        new.error_302_dict = {}
        if hasattr(req, 'error_302_dict'):
            if len(req.error_302_dict) > 10 or \
               newurl in req.error_302_dict:
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
            new.error_302_dict.update(req.error_302_dict)
        new.error_302_dict[newurl] = newurl

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    # Prefix for the HTTPError message raised by the loop detection.
    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 302 error message was:\n"
484
class ProxyHandler(BaseHandler):
    """Routes requests through per-scheme proxies.

    proxies maps a URL scheme to a proxy URL; when omitted, the mapping
    returned by getproxies() is used.
    """

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        # Give the instance one "<scheme>_open" attribute per entry, each
        # forwarding to proxy_open() with that entry's proxy URL.  The
        # values are bound as lambda defaults so every callable keeps
        # its own proxy rather than the last one of the loop.
        for scheme, url in proxies.items():
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy=url, type=scheme, meth=self.proxy_open: \
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Point req at proxy and decide who opens it.

        Returns None when the proxy has the same URL type as the request,
        so that the remaining handlers open it; otherwise re-opens the
        request through the parent.
        """
        orig_type = req.get_type()
        # From here on type is the proxy's own URL type.
        type, r_type = splittype(proxy)
        host, XXX = splithost(r_type)
        if '@' in host:
            user_pass, host = host.split('@', 1)
            if ':' in user_pass:
                user, password = user_pass.split(':', 1)
                raw = '%s:%s' % (unquote(user), unquote(password))
                # encodestring() appends a newline, which must not end up
                # in the header value; strip it the same way
                # retry_http_basic_auth() does.
                user_pass = base64.encodestring(raw).strip()
            req.add_header('Proxy-Authorization', 'Basic ' + user_pass)
        host = unquote(host)
        req.set_proxy(host, type)
        if orig_type == type:
            # let other handlers take care of it
            # XXX this only makes sense if the proxy is before the
            # other handlers
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            return self.parent.open(req)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000518
519# feature suggested by Duncan Booth
520# XXX custom is not a good name
class CustomProxy:
    """Describes one proxy together with a rule for when to use it."""

    # either pass a function to the constructor or override handle
    def __init__(self, proto, func=None, proxy_addr=None):
        self.proto = proto
        self.addr = proxy_addr
        self.func = func

    def handle(self, req):
        """Return 1 if func is set and accepts req, otherwise None."""
        predicate = self.func
        if not predicate:
            return None
        if predicate(req):
            return 1
        return None

    def get_proxy(self):
        """Return the proxy address given to the constructor."""
        return self.addr
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000534
class CustomProxyHandler(BaseHandler):
    """Chooses a proxy per request from CustomProxy-style objects.

    Every object passed to the constructor or to add_proxy() needs a
    proto attribute plus handle(req) and get_proxy() methods.
    """

    def __init__(self, *proxies):
        self.proxies = {}
        # Register the proxies given to the constructor; they used to be
        # silently discarded.
        for cpo in proxies:
            self.add_proxy(cpo)

    def proxy_open(self, req):
        """Open req through the first registered proxy that accepts it.

        Returns None when no proxy is registered for the request's type
        or none of them accepts the request.
        """
        proto = req.get_type()
        for p in self.proxies.get(proto, ()):
            if p.handle(req):
                # Request.set_proxy() takes (host, type); calling it with
                # the address alone raised TypeError.
                # NOTE(review): the request's own type is passed because
                # a CustomProxy carries no type for its address --
                # confirm this is the intended value.
                req.set_proxy(p.get_proxy(), proto)
                return self.parent.open(req)
        return None

    def do_proxy(self, p, req):
        """Open req through the parent; p is not used."""
        return self.parent.open(req)

    def add_proxy(self, cpo):
        """Register cpo under its proto, after any already registered."""
        self.proxies.setdefault(cpo.proto, []).append(cpo)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000559
class HTTPPasswordMgr:
    """Stores (user, password) pairs keyed by realm and URI.

    self.passwd maps realm -> {tuple of reduced URIs: (user, passwd)},
    where a reduced URI is the pair produced by reduce_uri().
    """

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register user/passwd for realm at uri.

        uri could be a single URI or a sequence of URIs.
        """
        if isinstance(uri, basestring):
            uri = [uri]
        uri = tuple(map(self.reduce_uri, uri))
        if realm not in self.passwd:
            self.passwd[realm] = {}
        self.passwd[realm][uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) registered for realm and authuri.

        Returns (None, None) when nothing matches.
        """
        domains = self.passwd.get(realm, {})
        authuri = self.reduce_uri(authuri)
        for uris, authinfo in domains.items():
            for uri in uris:
                if self.is_suburi(uri, authuri):
                    return authinfo
        return None, None

    def reduce_uri(self, uri):
        """Accept netloc or URI and extract only the netloc and path"""
        parts = urlparse.urlparse(uri)
        if parts[1]:
            return parts[1], parts[2] or '/'
        else:
            return parts[2], '/'

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare whole path segments.  A plain character-wise common
        # prefix treated '/foobar' as lying below '/foo', which could
        # hand credentials to an unrelated sibling path.
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix = prefix + '/'
        return test[1].startswith(prefix)
Tim Peterse1190062001-01-15 03:34:38 +0000603
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000604
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the entries stored for realm None."""

    def find_user_password(self, realm, authuri):
        """Look up realm first; if no user is found, retry with realm None."""
        lookup = HTTPPasswordMgr.find_user_password
        user, password = lookup(self, realm, authuri)
        if user is None:
            return lookup(self, None, authuri)
        return user, password
612
613
class AbstractBasicAuthHandler:
    """Shared logic for answering a Basic authentication challenge.

    Subclasses provide auth_header, the name of the request header that
    carries the credentials, and a parent attribute used to re-open the
    request.
    """

    # Matches: <scheme> realm="<realm>" at the start of a challenge.
    rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)

    # XXX there can actually be multiple auth-schemes in a
    # www-authenticate header.  should probably be a lot more careful
    # in parsing them to extract multiple alternatives

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        # expose the manager's add_password directly on the handler
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry req with credentials if headers carry a Basic challenge.

        authreq is the name of the challenge header to look up in
        headers.  Returns None when there is no usable Basic challenge.
        """
        # XXX could be multiple headers
        challenge = headers.get(authreq, None)
        if not challenge:
            return None
        mo = AbstractBasicAuthHandler.rx.match(challenge)
        if not mo:
            return None
        scheme, realm = mo.groups()
        if scheme.lower() != 'basic':
            return None
        return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-open req with the credentials stored for realm and host.

        Returns None when no password is known, or when the request
        already carries exactly these credentials.
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if not pw:
            return None
        raw = "%s:%s" % (user, pw)
        auth = 'Basic %s' % base64.encodestring(raw).strip()
        if req.headers.get(self.auth_header, None) == auth:
            # same credentials were already sent; do not retry
            return None
        req.add_header(self.auth_header, auth)
        return self.parent.open(req)
649
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answers 401 responses using the www-authenticate challenge."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry req with credentials looked up by the URL's netloc."""
        netloc = urlparse.urlparse(req.get_full_url())[1]
        return self.http_error_auth_reqed('www-authenticate',
                                          netloc, req, headers)
658
659
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answers 407 responses using the proxy-authenticate challenge."""

    auth_header = 'Proxy-Authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Retry req with credentials looked up by the request's host."""
        proxy_host = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          proxy_host, req, headers)
668
669
class AbstractDigestAuthHandler:
    """Digest-authentication logic shared by the 401 and 407 handlers.

    Subclasses must provide ``auth_header`` (the name of the request
    header the credentials are sent in); ``self.parent`` is used to
    re-issue the request.
    """

    def __init__(self, passwd=None):
        """Use *passwd* as password manager; create one when omitted."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry *req* when the response carries a Digest challenge.

        authreq -- name of the response header holding the challenge
                   (e.g. 'www-authenticate')
        host    -- accepted for signature compatibility; not used here
        req     -- the request that was refused
        headers -- headers of the refusing response

        Returns the result of the retried request, or None when there is
        no challenge or the challenge is not a Digest one.
        """
        # The challenge lives in the *response* header named by authreq,
        # not in the request header named by self.auth_header.
        challenge = headers.get(authreq, None)
        if not challenge:
            return None
        words = challenge.split()
        # Scheme names are compared case-insensitively.
        if words and words[0].lower() == 'digest':
            return self.retry_http_digest_auth(req, challenge)
        return None

    def retry_http_digest_auth(self, req, auth):
        """Answer the challenge string *auth* and re-open *req*.

        Returns None when no credentials can be computed, when the
        challenge has no parameters, or when the very same credentials
        are already on the request (so a rejected answer is not resent
        forever).
        """
        pieces = auth.split(' ', 1)
        if len(pieces) < 2:
            # A bare scheme name with no parameters cannot be answered.
            return None
        token, challenge = pieces
        chal = parse_keqv_list(parse_http_list(challenge))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp
        return None

    def get_authorization(self, req, chal):
        """Build the credentials string for the parsed challenge *chal*.

        Returns None when the challenge lacks a realm or nonce, names an
        unsupported algorithm, or no user is known for the realm.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
        except KeyError:
            return None
        algorithm = chal.get('algorithm', 'MD5')
        # mod_digest doesn't send an opaque, even though it isn't
        # supposed to be optional
        opaque = chal.get('opaque', None)

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm,
                                                  req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        if req.has_data():
            method = 'POST'
        else:
            method = 'GET'
        A1 = "%s:%s:%s" % (user, realm, pw)
        # XXX selector: what about proxies and full urls
        A2 = "%s:%s" % (method, req.get_selector())
        respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base = base + ', opaque="%s"' % opaque
        if entdig:
            base = base + ', digest="%s"' % entdig
        if algorithm != 'MD5':
            base = base + ', algorithm="%s"' % algorithm
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) functions for *algorithm*.

        Returns (None, None) for an algorithm that is not supported, which
        get_authorization() treats as "cannot answer this challenge".
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x, e=encode_digest: e(md5.new(x).digest())
        elif algorithm == 'SHA':
            H = lambda x, e=encode_digest: e(sha.new(x).digest())
        else:
            # XXX MD5-sess
            return None, None
        KD = lambda s, d, H=H: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        """Placeholder for the entity digest; always returns None."""
        # XXX not implemented yet
        return None
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000754
Moshe Zadka8a18e992001-03-01 08:40:42 +0000755
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    # AbstractDigestAuthHandler reads self.auth_header when it attaches
    # the credentials to the retried request.
    auth_header = 'Authorization'
    # Former spelling of the attribute, kept for existing references.
    header = auth_header

    def http_error_401(self, req, fp, code, msg, headers):
        """Answer a 401 response's Digest challenge.

        Returns the response of the retried request, or None when the
        shared logic does not retry.
        """
        host = urlparse.urlparse(req.get_full_url())[1]
        # The result must be returned, otherwise the retried response is
        # discarded and the caller sees the 401 as unhandled.
        return self.http_error_auth_reqed('www-authenticate',
                                          host, req, headers)
768
769
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication against a proxy (HTTP 407 responses)."""

    # AbstractDigestAuthHandler reads self.auth_header when it attaches
    # the credentials to the retried request.
    auth_header = 'Proxy-Authorization'
    # Former spelling of the attribute, kept for existing references.
    header = auth_header

    def http_error_407(self, req, fp, code, msg, headers):
        """Answer a 407 response's Digest challenge.

        Returns the response of the retried request, or None when the
        shared logic does not retry.
        """
        host = req.get_host()
        # The result must be returned, otherwise the retried response is
        # discarded and the caller sees the 407 as unhandled.
        return self.http_error_auth_reqed('proxy-authenticate',
                                          host, req, headers)
777
778
def encode_digest(digest):
    """Return *digest* spelled as lowercase hexadecimal digits.

    Every character of *digest* contributes two digits: the high nibble
    of its ordinal followed by the low nibble.
    """
    return ''.join(['%x%x' % ((ord(ch) >> 4) & 0xf, ord(ch) & 0xf)
                    for ch in digest])
Tim Peterse1190062001-01-15 03:34:38 +0000787
788
Moshe Zadka8a18e992001-03-01 08:40:42 +0000789class AbstractHTTPHandler(BaseHandler):
790
Jeremy Hylton828023b2003-05-04 23:44:49 +0000791 # XXX Should rewrite do_open() to use the new httplib interface,
792 # would would be a little simpler.
793
Moshe Zadka8a18e992001-03-01 08:40:42 +0000794 def do_open(self, http_class, req):
Moshe Zadka76676802001-04-11 07:44:53 +0000795 host = req.get_host()
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000796 if not host:
797 raise URLError('no host given')
798
Jeremy Hylton828023b2003-05-04 23:44:49 +0000799 h = http_class(host) # will parse host:port
800 if req.has_data():
801 data = req.get_data()
802 h.putrequest('POST', req.get_selector())
803 if not 'Content-type' in req.headers:
804 h.putheader('Content-type',
805 'application/x-www-form-urlencoded')
806 if not 'Content-length' in req.headers:
807 h.putheader('Content-length', '%d' % len(data))
808 else:
809 h.putrequest('GET', req.get_selector())
Tim Peterse1190062001-01-15 03:34:38 +0000810
Jeremy Hylton144dea32002-07-07 16:57:35 +0000811 scheme, sel = splittype(req.get_selector())
812 sel_host, sel_path = splithost(sel)
813 h.putheader('Host', sel_host or host)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000814 for args in self.parent.addheaders:
Andrew M. Kuchlingaca49b02002-11-06 15:40:05 +0000815 name, value = args
Jeremy Hylton96f11292002-10-11 17:26:46 +0000816 if name not in req.headers:
817 h.putheader(*args)
Fred Drake13a2c272000-02-10 17:17:14 +0000818 for k, v in req.headers.items():
819 h.putheader(k, v)
Jeremy Hyltonf6b444e2003-05-05 01:47:13 +0000820 # httplib will attempt to connect() here. be prepared
821 # to convert a socket error to a URLError.
Jeremy Hylton828023b2003-05-04 23:44:49 +0000822 try:
823 h.endheaders()
824 except socket.error, err:
825 raise URLError(err)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000826 if req.has_data():
Fred Drakeec3dfde2001-07-04 05:18:29 +0000827 h.send(data)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000828
829 code, msg, hdrs = h.getreply()
830 fp = h.getfile()
831 if code == 200:
832 return addinfourl(fp, hdrs, req.get_full_url())
833 else:
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000834 return self.parent.error('http', req, fp, code, msg, hdrs)
835
Moshe Zadka8a18e992001-03-01 08:40:42 +0000836
class HTTPHandler(AbstractHTTPHandler):
    """Open http requests over httplib.HTTP connections."""

    def http_open(self, req):
        """Return the result of do_open() for *req* using httplib.HTTP."""
        connection_class = httplib.HTTP
        return self.do_open(connection_class, req)
841
842
# Only define the https handler when this httplib offers an HTTPS class.
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Open https requests over httplib.HTTPS connections."""

        def https_open(self, req):
            """Return the result of do_open() for *req* using httplib.HTTPS."""
            connection_class = httplib.HTTPS
            return self.do_open(connection_class, req)
848
849
class UnknownHandler(BaseHandler):
    """Reject requests whose URL type reaches unknown_open()."""

    def unknown_open(self, req):
        """Always raise URLError naming the request's URL type."""
        url_type = req.get_type()
        raise URLError('unknown url type: %s' % url_type)
854
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dictionary mapping each key to its value.  A value that is
    wrapped in a pair of double quotes has those quotes removed; an empty
    value (``key=``) maps to the empty string.  Raises ValueError for an
    element that contains no ``=``.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Require two characters so an empty value cannot raise IndexError
        # and a lone '"' is not mistaken for a quoted string.
        if len(v) >= 2 and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
864
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  Inside a quoted-string a backslash escapes the
    next character; the backslash itself is dropped from the result.

    Returns a list of the elements with surrounding whitespace removed.
    Empty elements (as in 'a,,b') are skipped.  Raises ValueError when a
    quoted-string is never closed.
    """
    elements = []
    current = []        # characters of the element being collected
    in_quote = 0
    escaped = 0
    for ch in s:
        if escaped:
            # Character following a backslash: take it literally.
            current.append(ch)
            escaped = 0
        elif in_quote:
            if ch == '\\':
                escaped = 1
            else:
                if ch == '"':
                    in_quote = 0
                current.append(ch)
        elif ch == ',':
            # Only a comma outside quotes separates elements.
            elements.append(''.join(current))
            current = []
        else:
            if ch == '"':
                in_quote = 1
            current.append(ch)
    if in_quote:
        raise ValueError("unbalanced quotes")
    elements.append(''.join(current))

    result = []
    for element in elements:
        element = element.strip()
        if element:
            result.append(element)
    return result
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000910
class FileHandler(BaseHandler):
    """Open file requests from the local disk, or hand them on as ftp."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open *req* locally unless its selector looks like '//host...'.

        A selector that starts with exactly two slashes has the request's
        type changed to 'ftp' and is re-opened through the parent.
        """
        selector = req.get_selector()
        names_a_host = selector[:2] == '//' and selector[2:3] != '/'
        if not names_a_host:
            return self.open_local_file(req)
        req.type = 'ftp'
        return self.parent.open(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the addresses of this machine, resolved once per class."""
        if FileHandler.names is None:
            loopback = socket.gethostbyname('localhost')
            own_address = socket.gethostbyname(socket.gethostname())
            FileHandler.names = (loopback, own_address)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return the local file named by *req* wrapped in addinfourl.

        The headers carry Content-Type (text/plain when the type cannot
        be guessed), Content-Length and Last-modified.  Raises URLError
        when the request names a host that has a port or does not resolve
        to one of this machine's addresses.
        """
        host = req.get_host()
        selector = req.get_selector()
        localfile = url2pathname(selector)
        # The file is stat'ed before the host is examined.
        stats = os.stat(localfile)
        header_text = \
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % (
                mimetypes.guess_type(selector)[0] or 'text/plain',
                stats.st_size,
                rfc822.formatdate(stats.st_mtime))
        headers = mimetools.Message(StringIO(header_text))
        if host:
            host, port = splitport(host)
        if host and (port or
                     socket.gethostbyname(host) not in self.get_names()):
            raise URLError('file not on local host')
        return addinfourl(open(localfile, 'rb'),
                          headers, 'file:'+selector)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000948
949class FTPHandler(BaseHandler):
950 def ftp_open(self, req):
Fred Drake13a2c272000-02-10 17:17:14 +0000951 host = req.get_host()
952 if not host:
953 raise IOError, ('ftp error', 'no host given')
954 # XXX handle custom username & password
Jeremy Hylton73574ee2000-10-12 18:54:18 +0000955 try:
956 host = socket.gethostbyname(host)
957 except socket.error, msg:
958 raise URLError(msg)
Fred Drake13a2c272000-02-10 17:17:14 +0000959 host, port = splitport(host)
960 if port is None:
961 port = ftplib.FTP_PORT
962 path, attrs = splitattr(req.get_selector())
963 path = unquote(path)
Eric S. Raymondb08b2d32001-02-09 11:10:16 +0000964 dirs = path.split('/')
Fred Drake13a2c272000-02-10 17:17:14 +0000965 dirs, file = dirs[:-1], dirs[-1]
966 if dirs and not dirs[0]:
967 dirs = dirs[1:]
968 user = passwd = '' # XXX
969 try:
970 fw = self.connect_ftp(user, passwd, host, port, dirs)
971 type = file and 'I' or 'D'
972 for attr in attrs:
973 attr, value = splitattr(attr)
Eric S. Raymondb08b2d32001-02-09 11:10:16 +0000974 if attr.lower() == 'type' and \
Fred Drake13a2c272000-02-10 17:17:14 +0000975 value in ('a', 'A', 'i', 'I', 'd', 'D'):
Eric S. Raymondb08b2d32001-02-09 11:10:16 +0000976 type = value.upper()
Fred Drake13a2c272000-02-10 17:17:14 +0000977 fp, retrlen = fw.retrfile(file, type)
Guido van Rossum833a8d82001-08-24 13:10:13 +0000978 headers = ""
979 mtype = mimetypes.guess_type(req.get_full_url())[0]
980 if mtype:
981 headers += "Content-Type: %s\n" % mtype
Fred Drake13a2c272000-02-10 17:17:14 +0000982 if retrlen is not None and retrlen >= 0:
Guido van Rossum833a8d82001-08-24 13:10:13 +0000983 headers += "Content-Length: %d\n" % retrlen
984 sf = StringIO(headers)
985 headers = mimetools.Message(sf)
Fred Drake13a2c272000-02-10 17:17:14 +0000986 return addinfourl(fp, headers, req.get_full_url())
987 except ftplib.all_errors, msg:
988 raise IOError, ('ftp error', msg), sys.exc_info()[2]
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000989
990 def connect_ftp(self, user, passwd, host, port, dirs):
991 fw = ftpwrapper(user, passwd, host, port, dirs)
992## fw.ftp.set_debuglevel(1)
993 return fw
994
class CacheFTPHandler(FTPHandler):
    """FTPHandler that reuses ftpwrapper connections between requests.

    Connections are cached per (user, passwd, host, port).  An entry
    expires ``delay`` seconds after it was last requested.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}         # key -> ftpwrapper
        self.timeout = {}       # key -> expiry time of that entry
        self.soonest = 0        # earliest expiry time; 0 when cache is empty
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        """Set the number of seconds a cached connection stays valid."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which the oldest entry is dropped."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs):
        """Return the cached connection for this login/host, or a new one."""
        key = user, passwd, host, port
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
        # Hold a reference before pruning so that an eviction in
        # check_cache() cannot turn the return into a KeyError.
        fw = self.cache[key]
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return fw

    def check_cache(self):
        """Close expired connections, then drop the oldest when full."""
        # first check for old ones
        now = time.time()
        if self.soonest <= now:
            for k, v in list(self.timeout.items()):
                if v < now:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        self._update_soonest()

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self._update_soonest()

    def _update_soonest(self):
        """Recompute the earliest expiry; min() of nothing would raise."""
        if self.timeout:
            self.soonest = min(self.timeout.values())
        else:
            self.soonest = 0
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001040
1041class GopherHandler(BaseHandler):
1042 def gopher_open(self, req):
Fred Drake13a2c272000-02-10 17:17:14 +00001043 host = req.get_host()
1044 if not host:
1045 raise GopherError('no host given')
1046 host = unquote(host)
1047 selector = req.get_selector()
1048 type, selector = splitgophertype(selector)
1049 selector, query = splitquery(selector)
1050 selector = unquote(selector)
1051 if query:
1052 query = unquote(query)
1053 fp = gopherlib.send_query(selector, query, host)
1054 else:
1055 fp = gopherlib.send_selector(selector, host)
1056 return addinfourl(fp, noheaders(), req.get_full_url())
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001057
1058#bleck! don't use this yet
class OpenerFactory:
    """Collect handlers and build an OpenerDirector from them.

    Entries in the handler lists may be handler classes or instances;
    classes are instantiated when the opener is built.
    """

    default_handlers = [UnknownHandler, HTTPHandler,
                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
                        FTPHandler, FileHandler]
    proxy_handlers = [ProxyHandler]
    handlers = []
    replacement_handlers = []

    def add_proxy_handler(self, ph):
        """Register proxy handler *ph* on this factory only."""
        # Rebinding (not appending) leaves the class-level list untouched.
        self.proxy_handlers = self.proxy_handlers + [ph]

    def add_handler(self, h):
        """Register handler *h* on this factory only."""
        self.handlers = self.handlers + [h]

    def replace_handler(self, h):
        """Not implemented; does nothing."""
        pass

    def build_opener(self):
        """Return an OpenerDirector holding all registered handlers.

        Proxy handlers are added first, then the default handlers, then
        the handlers registered through add_handler().
        """
        opener = OpenerDirector()
        for group in (self.proxy_handlers, self.default_handlers,
                      self.handlers):
            for handler in group:
                if inspect.isclass(handler):
                    handler = handler()
                opener.add_handler(handler)
        return opener
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001083
1084if __name__ == "__main__":
Tim Peterse1190062001-01-15 03:34:38 +00001085 # XXX some of the test code depends on machine configurations that
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001086 # are internal to CNRI. Need to set up a public server with the
1087 # right authentication configuration for test purposes.
1088 if socket.gethostname() == 'bitdiddle':
1089 localhost = 'bitdiddle.cnri.reston.va.us'
Jeremy Hylton73574ee2000-10-12 18:54:18 +00001090 elif socket.gethostname() == 'bitdiddle.concentric.net':
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001091 localhost = 'localhost'
1092 else:
1093 localhost = None
1094 urls = [
Fred Drake13a2c272000-02-10 17:17:14 +00001095 # Thanks to Fred for finding these!
1096 'gopher://gopher.lib.ncsu.edu/11/library/stacks/Alex',
1097 'gopher://gopher.vt.edu:10010/10/33',
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001098
Fred Drake13a2c272000-02-10 17:17:14 +00001099 'file:/etc/passwd',
1100 'file://nonsensename/etc/passwd',
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001101 'ftp://www.python.org/pub/python/misc/sousa.au',
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001102 'ftp://www.python.org/pub/tmp/blat',
Fred Drake13a2c272000-02-10 17:17:14 +00001103 'http://www.espn.com/', # redirect
1104 'http://www.python.org/Spanish/Inquistion/',
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001105 ('http://www.python.org/cgi-bin/faqw.py',
Fred Drake13a2c272000-02-10 17:17:14 +00001106 'query=pythonistas&querytype=simple&casefold=yes&req=search'),
1107 'http://www.python.org/',
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001108 'ftp://gatekeeper.research.compaq.com/pub/DEC/SRC/research-reports/00README-Legal-Rules-Regs',
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001109 ]
1110
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001111## if localhost is not None:
1112## urls = urls + [
1113## 'file://%s/etc/passwd' % localhost,
1114## 'http://%s/simple/' % localhost,
1115## 'http://%s/digest/' % localhost,
1116## 'http://%s/not/found.h' % localhost,
1117## ]
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001118
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001119## bauth = HTTPBasicAuthHandler()
1120## bauth.add_password('basic_test_realm', localhost, 'jhylton',
1121## 'password')
1122## dauth = HTTPDigestAuthHandler()
1123## dauth.add_password('digest_test_realm', localhost, 'jhylton',
1124## 'password')
Tim Peterse1190062001-01-15 03:34:38 +00001125
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001126
1127 cfh = CacheFTPHandler()
1128 cfh.setTimeout(1)
1129
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001130## # XXX try out some custom proxy objects too!
1131## def at_cnri(req):
1132## host = req.get_host()
1133## print host
1134## if host[-18:] == '.cnri.reston.va.us':
1135## return 1
1136## p = CustomProxy('http', at_cnri, 'proxy.cnri.reston.va.us')
1137## ph = CustomProxyHandler(p)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001138
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001139## install_opener(build_opener(dauth, bauth, cfh, GopherHandler, ph))
1140 install_opener(build_opener(cfh, GopherHandler))
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001141
1142 for url in urls:
Walter Dörwald65230a22002-06-03 15:58:32 +00001143 if isinstance(url, tuple):
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001144 url, req = url
1145 else:
1146 req = None
1147 print url
1148 try:
1149 f = urlopen(url, req)
1150 except IOError, err:
Fred Drake13a2c272000-02-10 17:17:14 +00001151 print "IOError:", err
1152 except socket.error, err:
1153 print "socket.error:", err
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001154 else:
1155 buf = f.read()
1156 f.close()
1157 print "read %d bytes" % len(buf)
1158 print
1159 time.sleep(0.1)