blob: df3729ab80c9e050c75cf276e8dd189c2cf944a2 [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""An extensible library for opening URLs using a variety of protocols
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00002
3The simplest way to use this module is to call the urlopen function,
Tim Peterse1190062001-01-15 03:34:38 +00004which accepts a string containing a URL or a Request object (described
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00005below). It opens the URL and returns the results as file-like
6object; the returned object has some extra methods described below.
7
Jeremy Hyltone1906632002-10-11 17:27:55 +00008The OpenerDirector manages a collection of Handler objects that do
Tim Peterse1190062001-01-15 03:34:38 +00009all the actual work. Each Handler implements a particular protocol or
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000010option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
Raymond Hettinger024aaa12003-04-24 15:32:12 +000014HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000016
urlopen(url, data=None) -- basic usage is the same as original
18urllib. pass the url and optionally data to post to an HTTP URL, and
Tim Peterse1190062001-01-15 03:34:38 +000019get a file-like object back. One difference is that you can also pass
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000020a Request instance instead of URL. Raises a URLError (subclass of
21IOError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- function that creates a new OpenerDirector instance.
25will install the default handlers. accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
instantiate. if one of the arguments is a subclass of the default
28handler, the argument will be installed instead of the default.
29
30install_opener -- installs a new opener as the default opener.
31
32objects of interest:
33OpenerDirector --
34
35Request -- an object that encapsulates the state of a request. the
state can be as simple as the URL. it can also include extra HTTP
37headers, e.g. a User-Agent.
38
39BaseHandler --
40
41exceptions:
42URLError-- a subclass of IOError, individual protocols have their own
43specific subclass
44
Tim Peterse1190062001-01-15 03:34:38 +000045HTTPError-- also a valid HTTP response, so you can treat an HTTP error
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000046as an exceptional event or valid response
47
48internals:
49BaseHandler and parent
50_call_chain conventions
51
52Example usage:
53
54import urllib2
55
56# set up authentication info
57authinfo = urllib2.HTTPBasicAuthHandler()
58authinfo.add_password('realm', 'host', 'username', 'password')
59
Moshe Zadka8a18e992001-03-01 08:40:42 +000060proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
61
Tim Peterse1190062001-01-15 03:34:38 +000062# build a new opener that adds authentication and caching FTP handlers
Moshe Zadka8a18e992001-03-01 08:40:42 +000063opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000064
65# install it
66urllib2.install_opener(opener)
67
68f = urllib2.urlopen('http://www.python.org/')
69
70
71"""
72
73# XXX issues:
74# If an authentication error handler that tries to perform
Fred Draked5214b02001-11-08 17:19:29 +000075# authentication for some reason but fails, how should the error be
76# signalled? The client needs to know the HTTP error code. But if
# the handler knows what the problem was, e.g., that it didn't know
# the hash algo that was requested in the challenge, it would be good to
79# pass that information along to the client, too.
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000080
81# XXX to do:
82# name!
83# documentation (getting there)
84# complex proxies
85# abstract factory for opener
86# ftp errors aren't handled cleanly
87# gopher can return a socket.error
88# check digest against correct (i.e. non-apache) implementation
89
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000090import socket
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000091import httplib
Jeremy Hylton8b78b992001-10-09 16:18:45 +000092import inspect
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000093import re
94import base64
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000095import urlparse
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000096import md5
97import mimetypes
98import mimetools
Jeremy Hylton6d8c1aa2001-08-27 20:16:53 +000099import rfc822
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000100import ftplib
101import sys
102import time
Jeremy Hylton6d8c1aa2001-08-27 20:16:53 +0000103import os
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000104import gopherlib
Moshe Zadka8a18e992001-03-01 08:40:42 +0000105import posixpath
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000106
107try:
108 from cStringIO import StringIO
109except ImportError:
110 from StringIO import StringIO
111
112try:
113 import sha
114except ImportError:
115 # need 1.5.2 final
116 sha = None
117
118# not sure how many of these need to be gotten rid of
119from urllib import unwrap, unquote, splittype, splithost, \
120 addinfourl, splitport, splitgophertype, splitquery, \
121 splitattr, ftpwrapper, noheaders
122
123# support for proxies via environment variables
124from urllib import getproxies
125
126# support for FileHandler
Moshe Zadka8a18e992001-03-01 08:40:42 +0000127from urllib import localhost, url2pathname
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000128
129__version__ = "2.0a1"
130
_opener = None
def urlopen(url, data=None):
    """Open url through the module-wide opener and return its result.

    url and data are handed unchanged to the opener's open() method.
    The module-wide opener is created with build_opener() the first
    time it is needed and reused afterwards.
    """
    global _opener
    opener = _opener
    if opener is None:
        # Lazily create the shared default opener on first use.
        opener = _opener = build_opener()
    return opener.open(url, data)
137
def install_opener(opener):
    """Make opener the module-wide opener that urlopen() will use."""
    global _opener
    _opener = opener
141
142# do these error classes make sense?
Tim Peterse1190062001-01-15 03:34:38 +0000143# make sure all of the IOError stuff is overridden. we just want to be
Fred Drakea87a5212002-08-13 13:59:55 +0000144# subtypes.
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000145
class URLError(IOError):
    """Error raised by this module's URL handling; a sub-type of IOError.

    URLError shares none of IOError's implementation, so __init__ and
    __str__ are overridden.  The single argument is stored as .reason.
    """

    def __init__(self, reason):
        # IOError.__init__ is deliberately not called, so fill in args
        # ourselves; otherwise the exception carries an empty args tuple.
        self.args = (reason,)
        self.reason = reason

    def __str__(self):
        # Wrap reason in a 1-tuple: a bare tuple reason (e.g. the
        # (errno, message) pair from a socket error) would otherwise be
        # unpacked by the % operator and raise TypeError.
        return '<urlopen error %s>' % (self.reason,)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000154
class HTTPError(URLError, addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return.

    Stores the status code, message, headers and response file object
    as .code, .msg, .hdrs and .fp, and the URL as .filename.
    """
    __super_init = addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        self.code = code
        self.msg = msg
        self.hdrs = hdrs
        self.fp = fp
        self.filename = url
        # The addinfourl classes depend on fp being a valid file
        # object.  In some cases, the HTTPError may not have a valid
        # file object.  If this happens, the simplest workaround is to
        # not initialize the base classes.
        if fp is not None:
            self.__super_init(fp, hdrs, url)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)

    def __del__(self):
        # XXX is this safe? what if user catches exception, then
        # extracts fp and discards exception?
        # Use getattr: if __init__ never ran to completion the instance
        # has no fp attribute and a plain lookup would raise here.
        fp = getattr(self, 'fp', None)
        if fp:
            fp.close()
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000180
class GopherError(URLError):
    """URLError subclass for gopher errors; adds no behavior of its own."""
183
Moshe Zadka8a18e992001-03-01 08:40:42 +0000184
class Request:
    """Encapsulate the state of a request for a URL.

    Holds the original URL, optional data to send, and a dict of extra
    headers.  The URL is split into type, host and selector lazily, by
    get_type(), get_host() and get_selector().
    """

    def __init__(self, url, data=None, headers=None):
        """Create a request for url.

        data, if not None, is the request payload (see get_method()).
        headers is an optional mapping of header name to value; each
        entry is stored through add_header().
        """
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self.data = data
        self.headers = {}
        # None instead of a shared mutable {} default.
        if headers is not None:
            for key, value in headers.items():
                self.add_header(key, value)

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        if attr[:12] == '_Request__r_':
            name = attr[12:]
            if hasattr(Request, 'get_' + name):
                getattr(self, 'get_' + name)()
                # Only return the attribute if the getter really set it.
                # Re-entering getattr() here would recurse without bound
                # when it did not (e.g. __r_type after set_proxy()).
                if attr in self.__dict__:
                    return self.__dict__[attr]
        raise AttributeError(attr)

    def get_method(self):
        """Return "POST" if the request has data, otherwise "GET"."""
        if self.has_data():
            return "POST"
        else:
            return "GET"

    def add_data(self, data):
        """Set the request's data."""
        self.data = data

    def has_data(self):
        """Return whether data has been set (is not None)."""
        return self.data is not None

    def get_data(self):
        """Return the request's data (None if unset)."""
        return self.data

    def get_full_url(self):
        """Return the unwrapped URL the request was created with."""
        return self.__original

    def get_type(self):
        """Return the URL type (scheme); raise ValueError if there is none."""
        if self.type is None:
            self.type, self.__r_type = splittype(self.__original)
            if self.type is None:
                raise ValueError("unknown url type: %s" % self.__original)
        return self.type

    def get_host(self):
        """Return the unquoted host part of the URL."""
        if self.host is None:
            self.host, self.__r_host = splithost(self.__r_type)
            if self.host:
                self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        """Return what remains of the URL after the host is split off."""
        return self.__r_host

    def set_proxy(self, host, type):
        """Redirect the request to a proxy.

        host and type replace the request's own; the selector becomes
        the full original URL.
        """
        self.host, self.type = host, type
        self.__r_host = self.__original

    def add_header(self, key, val):
        """Store header val under key.capitalize()."""
        # useful for something like authentication
        self.headers[key.capitalize()] = val
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000253
class OpenerDirector:
    """Manage a collection of handlers and dispatch requests to them.

    Handlers are indexed by the names of their methods: "<protocol>_open"
    methods go into handle_open, "<proto>_error_<kind>" methods go into
    handle_error.
    """

    def __init__(self):
        self.addheaders = [('User-agent', "Python-urllib/%s" % __version__)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}

    def add_handler(self, handler):
        """Register handler under every open/error method name it has.

        The handler is added to self.handlers (and told its parent) only
        if at least one of its method names matched.
        """
        registered = False
        for meth in dir(handler):
            if meth.endswith('_open'):
                # "<protocol>_open": chain keyed by protocol
                chain = self.handle_open.setdefault(meth[:-5], [])
                chain.append(handler)
                chain.sort()
                registered = True
                continue
            # "<proto>_error_<kind>": at most two splits, so kind keeps
            # any further underscores
            pieces = meth.split('_', 2)
            if len(pieces) == 3 and pieces[1] == 'error':
                proto, kind = pieces[0], pieces[2]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                by_kind = self.handle_error.setdefault(proto, {})
                chain = by_kind.setdefault(kind, [])
                chain.append(handler)
                chain.sort()
                registered = True
        if registered:
            self.handlers.append(handler)
            self.handlers.sort()
            handler.add_parent(self)

    def __del__(self):
        self.close()

    def close(self):
        """Close every registered handler and forget them."""
        for handler in self.handlers:
            handler.close()
        self.handlers = []

    def _call_chain(self, chain, kind, meth_name, *args):
        """Call meth_name(*args) on each handler in chain[kind] in turn.

        Returns the first result that is not None; returns None if
        there is no such result.
        """
        # XXX raise an exception if no one else should try to handle
        # this url.  return None if you can't but someone else could.
        for handler in chain.get(kind, ()):
            outcome = getattr(handler, meth_name)(*args)
            if outcome is not None:
                return outcome

    def open(self, fullurl, data=None):
        """Open fullurl, which is a URL string or a Request object.

        Tries the 'default' chain, then the chain for the request's
        type, then the 'unknown' chain.
        """
        # accept a URL or a Request object
        if not isinstance(fullurl, basestring):
            req = fullurl
            if data is not None:
                req.add_data(data)
        else:
            req = Request(fullurl, data)

        response = self._call_chain(self.handle_open, 'default',
                                    'default_open', req)
        if response:
            return response

        scheme = req.get_type()
        response = self._call_chain(self.handle_open, scheme,
                                    scheme + '_open', req)
        if response:
            return response

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error for proto to the registered error handlers.

        For 'http' and 'https' the third extra argument is used as the
        kind, and the 'default' chain (http_error_default) is tried if
        no specific handler returns a true result.
        """
        is_http = proto in ['http', 'https']
        if is_http:
            # XXX http[s] protocols are special-cased
            table = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%d' % proto
        else:
            table = self.handle_error
            meth_name = proto + '_error'
        outcome = self._call_chain(table, proto, meth_name, *args)
        if outcome:
            return outcome

        if is_http:
            return self._call_chain(table, 'default',
                                    'http_error_default', *args)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000360
Gustavo Niemeyer9556fba2003-06-07 17:53:08 +0000361# XXX probably also want an abstract factory that knows when it makes
362# sense to skip a superclass in favor of a subclass and when it might
363# make sense to include both
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000364
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    Each argument may be a handler instance or a handler class (which
    is instantiated with no arguments).

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(HTTPSHandler)
    skip = []
    for klass in default_classes:
        for check in handlers:
            if inspect.isclass(check):
                overrides = issubclass(check, klass)
            else:
                overrides = isinstance(check, klass)
            # Record each default at most once: two supplied handlers
            # deriving from the same default would otherwise make the
            # second remove() below raise ValueError.
            if overrides and klass not in skip:
                skip.append(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if inspect.isclass(h):
            h = h()
        opener.add_handler(h)
    return opener
400
class BaseHandler:
    """Base class for handlers: tracks the parent and defines ordering."""

    # Sort key; a handler with a smaller value sorts first.
    handler_order = 500

    def add_parent(self, parent):
        """Remember parent as self.parent."""
        self.parent = parent

    def close(self):
        """Drop the reference to the parent."""
        self.parent = None

    def __lt__(self, other):
        """Order handlers by handler_order."""
        if hasattr(other, "handler_order"):
            return self.handler_order < other.handler_order
        # Try to preserve the old behavior of having custom classes
        # inserted after default ones (works only for custom user
        # classes which are not aware of handler_order).
        return True
Tim Petersf545baa2003-06-15 23:26:30 +0000415
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000416
class HTTPDefaultErrorHandler(BaseHandler):
    """Handler whose http_error_default always raises HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        """Raise HTTPError built from the request URL and the response."""
        url = req.get_full_url()
        raise HTTPError(url, code, msg, hdrs, fp)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000420
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirects, with loop detection."""

    # Prefix for the HTTPError message raised when a loop is detected.
    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        method = req.get_method()
        safe_method = (code in (301, 302, 303, 307)
                       and method in ("GET", "HEAD"))
        # Strictly (according to RFC 2616), 301 or 302 in response
        # to a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib2, in this case).  In practice,
        # essentially all clients do redirect in this case, so we
        # do the same.
        post_method = code in (301, 302, 303) and method == "POST"
        if not (safe_method or post_method):
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)
        return Request(newurl, headers=req.headers)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow the redirect named by the 'location' or 'uri' header.

        Returns None when neither header is present or when
        redirect_request() returns None; raises HTTPError when a
        redirect loop is detected.
        """
        for field in ('location', 'uri'):
            if field in headers:
                target = headers[field]
                break
        else:
            return
        newurl = urlparse.urljoin(req.get_full_url(), target)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        visited = new.error_302_dict = {}
        if hasattr(req, 'error_302_dict'):
            earlier = req.error_302_dict
            if len(earlier) > 10 or newurl in earlier:
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
            visited.update(earlier)
        visited[newurl] = newurl

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000486
class ProxyHandler(BaseHandler):
    """Route requests through the proxies given per URL type.

    For every (type, url) pair in the proxies mapping, a
    "<type>_open" method is attached to the instance that forwards to
    proxy_open().
    """
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        """proxies maps URL type to proxy URL; defaults to getproxies()."""
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, url in proxies.items():
            # Bind the loop values as lambda defaults so every generated
            # method keeps its own proxy URL and type.
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy=url, type=scheme, meth=self.proxy_open: \
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Point req at proxy, adding Proxy-authorization if it has user info.

        Returns None when the proxy's type equals the request's original
        type (so later handlers continue); otherwise re-opens the request
        through the parent.
        """
        orig_type = req.get_type()
        type, r_type = splittype(proxy)
        host, XXX = splithost(r_type)
        if '@' in host:
            user_pass, host = host.split('@', 1)
            if ':' in user_pass:
                user, password = user_pass.split(':', 1)
                # encodestring() appends a newline, which must not end
                # up inside the header value; strip it as
                # AbstractBasicAuthHandler does.
                user_pass = base64.encodestring(
                    '%s:%s' % (unquote(user), unquote(password))).strip()
            req.add_header('Proxy-authorization', 'Basic ' + user_pass)
        host = unquote(host)
        req.set_proxy(host, type)
        if orig_type == type:
            # let other handlers take care of it
            # XXX this only makes sense if the proxy is before the
            # other handlers
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            return self.parent.open(req)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000523
524# feature suggested by Duncan Booth
525# XXX custom is not a good name
class CustomProxy:
    """Pair a protocol and proxy address with an optional predicate."""

    # either pass a function to the constructor or override handle
    def __init__(self, proto, func=None, proxy_addr=None):
        self.proto, self.func, self.addr = proto, func, proxy_addr

    def handle(self, req):
        """Return 1 if func is set and func(req) is true, else None."""
        if not self.func:
            return None
        if self.func(req):
            return 1
        return None

    def get_proxy(self):
        """Return the proxy address given to the constructor."""
        return self.addr
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000539
class CustomProxyHandler(BaseHandler):
    """Choose a proxy per request by consulting CustomProxy-like objects.

    self.proxies maps a protocol name to the list of proxy objects
    registered for it.
    """
    # Proxies must be in front
    handler_order = 100

    def __init__(self, *proxies):
        self.proxies = {}
        # Register the proxies passed in; they used to be dropped.
        for cpo in proxies:
            self.add_proxy(cpo)

    def proxy_open(self, req):
        """Re-open req through the first registered proxy that accepts it.

        Returns None if no proxy is registered for the request's type or
        none of them accepts the request.
        """
        proto = req.get_type()
        try:
            proxies = self.proxies[proto]
        except KeyError:
            return None
        for p in proxies:
            if p.handle(req):
                # set_proxy() needs both host and type; the proxy object
                # only supplies an address, so keep the request's type.
                # NOTE(review): confirm the request's own type is the
                # intended proxy type here.
                req.set_proxy(p.get_proxy(), proto)
                return self.parent.open(req)
        return None

    def do_proxy(self, p, req):
        """Open req through the parent opener (p is not used)."""
        return self.parent.open(req)

    def add_proxy(self, cpo):
        """Register cpo under its cpo.proto protocol."""
        if cpo.proto in self.proxies:
            self.proxies[cpo.proto].append(cpo)
        else:
            self.proxies[cpo.proto] = [cpo]
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000567
class HTTPPasswordMgr:
    """Store (user, password) pairs keyed by realm and URI.

    self.passwd maps realm -> {tuple of reduced URIs: (user, passwd)}.
    """

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register user/passwd for realm and the given URI(s)."""
        # uri could be a single URI or a sequence
        if isinstance(uri, basestring):
            uri = [uri]
        uri = tuple(map(self.reduce_uri, uri))
        if realm not in self.passwd:
            self.passwd[realm] = {}
        self.passwd[realm][uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, passwd) registered for realm that covers authuri.

        Returns (None, None) when nothing matches.
        """
        domains = self.passwd.get(realm, {})
        authuri = self.reduce_uri(authuri)
        for uris, authinfo in domains.items():
            for uri in uris:
                if self.is_suburi(uri, authuri):
                    return authinfo
        return None, None

    def reduce_uri(self, uri):
        """Accept netloc or URI and extract only the netloc and path"""
        parts = urlparse.urlparse(uri)
        if parts[1]:
            return parts[1], parts[2] or '/'
        else:
            return parts[2], '/'

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare whole path components: a character-wise common prefix
        # would treat "/foobar" as being below "/foo" and hand the
        # credentials to an unrelated path.
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix = prefix + '/'
        return test[1].startswith(prefix)
Tim Peterse1190062001-01-15 03:34:38 +0000611
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000612
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the realm None."""

    def find_user_password(self, realm, authuri):
        """Look up realm first; if no user is found, retry with realm None."""
        found = HTTPPasswordMgr.find_user_password(self, realm, authuri)
        if found[0] is not None:
            return found[0], found[1]
        return HTTPPasswordMgr.find_user_password(self, None, authuri)
621
622
class AbstractBasicAuthHandler:
    """Shared logic for answering a Basic authentication challenge.

    Subclasses supply auth_header (the request header to set); a parent
    opener is used to retry the request.
    """

    # Captures the scheme token and the quoted realm of a challenge.
    rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)

    # XXX there can actually be multiple auth-schemes in a
    # www-authenticate header.  should probably be a lot more careful
    # in parsing them to extract multiple alternatives

    def __init__(self, password_mgr=None):
        """Use password_mgr, or a fresh HTTPPasswordMgr if None."""
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry req with credentials if headers[authreq] is a Basic challenge.

        Returns None when the header is missing or empty, does not match
        rx, or names a scheme other than basic.
        """
        # XXX could be multiple headers
        challenge = headers.get(authreq, None)
        if not challenge:
            return None
        mo = AbstractBasicAuthHandler.rx.match(challenge)
        if not mo:
            return None
        scheme, realm = mo.groups()
        if scheme.lower() != 'basic':
            return None
        return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Add the Basic credentials for realm/host to req and re-open it.

        Returns None if no password is known, or if req already carries
        exactly these credentials.
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if not pw:
            return None
        credentials = "%s:%s" % (user, pw)
        auth = 'Basic %s' % base64.encodestring(credentials).strip()
        if req.headers.get(self.auth_header, None) == auth:
            return None
        req.add_header(self.auth_header, auth)
        return self.parent.open(req)
658
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer HTTP 401 responses using Basic authentication."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Handle a 401 using the netloc of the request's full URL as host."""
        netloc = urlparse.urlparse(req.get_full_url())[1]
        return self.http_error_auth_reqed('www-authenticate',
                                          netloc, req, headers)
667
668
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses using HTTP basic authentication."""

    # request header that carries the credentials
    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Handle a 407 by retrying with credentials for req's host."""
        proxy_host = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          proxy_host, req, headers)
677
678
class AbstractDigestAuthHandler:
    """Mix-in that answers HTTP digest authentication challenges.

    Concrete handlers define auth_header (the request header that
    carries the credentials) and provide self.parent, whose open() is
    used to re-issue the request.
    """

    def __init__(self, passwd=None):
        """Store the password manager, creating an HTTPPasswordMgr if
        none is given."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry req if the header named authreq holds a digest challenge.

        Returns the response of the retried request, or None when
        there is no digest challenge to answer.
        """
        # Read the challenge from the header the caller named (the
        # server's *-authenticate header), not from self.auth_header,
        # which is the header the client sends.
        challenge = headers.get(authreq, None)
        if challenge:
            words = challenge.split()
            # scheme names are compared case-insensitively, as the
            # basic handler does
            if words and words[0].lower() == 'digest':
                return self.retry_http_digest_auth(req, challenge)
        return None

    def retry_http_digest_auth(self, req, auth):
        """Re-issue req with an answer to the digest challenge in auth.

        auth is the full challenge header value ("Digest k=v, ...").
        Returns the new response, or None when no answer could be
        computed or the same answer was already sent.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(parse_http_list(challenge))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # identical credentials were already rejected
                return None
            req.add_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp
        return None

    def get_authorization(self, req, chal):
        """Build the credentials string answering the challenge chal.

        chal is a dict of challenge parameters.  Returns the text that
        follows "Digest " in the request header, or None when the
        challenge lacks realm/nonce, names an unsupported algorithm,
        or no user is known for the realm and URL.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm,
                                                  req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.has_data() and 'POST' or 'GET',
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base = base + ', opaque="%s"' % opaque
        if entdig:
            base = base + ', digest="%s"' % entdig
        if algorithm != 'MD5':
            base = base + ', algorithm="%s"' % algorithm
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the pair (H, KD) of hash helpers for algorithm.

        H hashes a string and returns the hex-encoded digest; KD
        hashes "secret:data".  H is None for an algorithm other than
        'MD5' or 'SHA', which callers must check before using KD.
        """
        # Start from None so that an unsupported algorithm is reported
        # to the caller instead of leaving H unbound.
        H = None
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x, e=encode_digest:e(md5.new(x).digest())
        elif algorithm == 'SHA':
            H = lambda x, e=encode_digest:e(sha.new(x).digest())
        # XXX MD5-sess
        KD = lambda s, d, H=H: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        """Placeholder for the entity digest; always returns None."""
        # XXX not implemented yet
        return None
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000763
Moshe Zadka8a18e992001-03-01 08:40:42 +0000764
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    # request header that carries the credentials
    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Handle a 401 by retrying with digest credentials.

        Returns the retried response, or None when the challenge
        cannot be answered.
        """
        host = urlparse.urlparse(req.get_full_url())[1]
        # The result must be returned: dropping it hides a successful
        # retry from the caller.
        return self.http_error_auth_reqed('www-authenticate',
                                          host, req, headers)
777
778
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 responses using HTTP digest authentication."""

    # request header that carries the credentials
    auth_header = 'Proxy-Authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Handle a 407 by retrying with digest credentials.

        Returns the retried response, or None when the challenge
        cannot be answered.
        """
        host = req.get_host()
        # The result must be returned: dropping it hides a successful
        # retry from the caller.
        return self.http_error_auth_reqed('proxy-authenticate',
                                          host, req, headers)
786
787
def encode_digest(digest):
    """Return digest spelled as lowercase hexadecimal.

    Every character of digest contributes two hex digits, the high
    nibble first.
    """
    return ''.join(['%02x' % ord(c) for c in digest])
Tim Peterse1190062001-01-15 03:34:38 +0000796
797
Moshe Zadka8a18e992001-03-01 08:40:42 +0000798class AbstractHTTPHandler(BaseHandler):
799
Jeremy Hylton828023b2003-05-04 23:44:49 +0000800 # XXX Should rewrite do_open() to use the new httplib interface,
801 # would would be a little simpler.
802
Moshe Zadka8a18e992001-03-01 08:40:42 +0000803 def do_open(self, http_class, req):
Moshe Zadka76676802001-04-11 07:44:53 +0000804 host = req.get_host()
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000805 if not host:
806 raise URLError('no host given')
807
Jeremy Hylton828023b2003-05-04 23:44:49 +0000808 h = http_class(host) # will parse host:port
809 if req.has_data():
810 data = req.get_data()
811 h.putrequest('POST', req.get_selector())
812 if not 'Content-type' in req.headers:
813 h.putheader('Content-type',
814 'application/x-www-form-urlencoded')
815 if not 'Content-length' in req.headers:
816 h.putheader('Content-length', '%d' % len(data))
817 else:
818 h.putrequest('GET', req.get_selector())
Tim Peterse1190062001-01-15 03:34:38 +0000819
Jeremy Hylton144dea32002-07-07 16:57:35 +0000820 scheme, sel = splittype(req.get_selector())
821 sel_host, sel_path = splithost(sel)
822 h.putheader('Host', sel_host or host)
Brett Cannon783eaf42003-06-17 21:52:34 +0000823 for name, value in self.parent.addheaders:
824 name = name.capitalize()
Jeremy Hylton96f11292002-10-11 17:26:46 +0000825 if name not in req.headers:
Brett Cannon783eaf42003-06-17 21:52:34 +0000826 h.putheader(name, value)
Brett Cannondf0d87a2003-05-18 02:25:07 +0000827 for k, v in req.headers.items():
Fred Drake13a2c272000-02-10 17:17:14 +0000828 h.putheader(k, v)
Jeremy Hyltonf6b444e2003-05-05 01:47:13 +0000829 # httplib will attempt to connect() here. be prepared
830 # to convert a socket error to a URLError.
Jeremy Hylton828023b2003-05-04 23:44:49 +0000831 try:
832 h.endheaders()
833 except socket.error, err:
834 raise URLError(err)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000835 if req.has_data():
Fred Drakeec3dfde2001-07-04 05:18:29 +0000836 h.send(data)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000837
838 code, msg, hdrs = h.getreply()
839 fp = h.getfile()
840 if code == 200:
841 return addinfourl(fp, hdrs, req.get_full_url())
842 else:
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000843 return self.parent.error('http', req, fp, code, msg, hdrs)
844
Moshe Zadka8a18e992001-03-01 08:40:42 +0000845
class HTTPHandler(AbstractHTTPHandler):
    """Open http: URLs through httplib.HTTP."""

    def http_open(self, req):
        """Perform req over a plain HTTP connection."""
        connection_class = httplib.HTTP
        return self.do_open(connection_class, req)
850
851
# HTTPSHandler is defined only when httplib offers an HTTPS class.
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Open https: URLs through httplib.HTTPS."""

        def https_open(self, req):
            """Perform req over an HTTPS connection."""
            connection_class = httplib.HTTPS
            return self.do_open(connection_class, req)
857
858
class UnknownHandler(BaseHandler):
    """Handler whose unknown_open() rejects the request."""

    def unknown_open(self, req):
        """Raise URLError naming the URL type of req."""
        raise URLError('unknown url type: %s' % req.get_type())
863
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dict mapping each key to its value, with one pair of
    enclosing double quotes removed from the value when present.  An
    element with nothing after the '=' maps to the empty string.
    Raises ValueError for an element that contains no '='.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # slice instead of index so an empty value cannot raise IndexError
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
873
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.

    Returns the list of elements with surrounding whitespace removed.
    Raises ValueError when a quoted-string is opened before a comma
    and never closed.
    """
    # XXX this function could probably use more testing

    items = []
    end = len(s)
    start = 0   # index where the element being collected begins
    i = 0       # scan position; never inside a quoted-string
    while i < end:
        comma = s.find(',', i)
        if comma == -1:
            # no more separators: the rest is the final element
            break
        quote = s.find('"', i)
        if quote != -1 and quote < comma:
            # A quoted-string opens before the next comma, so that
            # comma may be part of it; resume after the closing quote.
            close = s.find('"', quote + 1)
            if close == -1:
                raise ValueError("unbalanced quotes")
            i = close + 1
            continue
        items.append(s[start:comma])
        # the next element begins right after the separator
        i = start = comma + 1
    if start < end:
        items.append(s[start:])
    return [item.strip() for item in items]
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000919
class FileHandler(BaseHandler):
    """Open file: URLs, handing host-qualified ones on as ftp."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open req locally, or re-dispatch it as ftp when the selector
        starts with exactly two slashes."""
        url = req.get_selector()
        names_a_host = url[:2] == '//' and url[2:3] != '/'
        if not names_a_host:
            return self.open_local_file(req)
        req.type = 'ftp'
        return self.parent.open(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the addresses of this machine, computed once and kept
        on the class."""
        if FileHandler.names is None:
            loopback = socket.gethostbyname('localhost')
            own = socket.gethostbyname(socket.gethostname())
            FileHandler.names = (loopback, own)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl object for the local file named by req.

        The headers carry content type, length and modification time.
        Raises URLError when req names a port or a host that does not
        resolve to one of get_names().
        """
        host = req.get_host()
        selector = req.get_selector()
        localfile = url2pathname(selector)
        stats = os.stat(localfile)
        size = stats.st_size
        modified = rfc822.formatdate(stats.st_mtime)
        mtype = mimetypes.guess_type(selector)[0]
        header_text = (
            'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        headers = mimetools.Message(StringIO(header_text))
        if host:
            host, port = splitport(host)
        # port is only bound when a host was given; "not host"
        # short-circuits before it is read otherwise
        if not host or \
           (not port and socket.gethostbyname(host) in self.get_names()):
            return addinfourl(open(localfile, 'rb'),
                              headers, 'file:' + selector)
        raise URLError('file not on local host')
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000957
958class FTPHandler(BaseHandler):
959 def ftp_open(self, req):
Fred Drake13a2c272000-02-10 17:17:14 +0000960 host = req.get_host()
961 if not host:
962 raise IOError, ('ftp error', 'no host given')
963 # XXX handle custom username & password
Jeremy Hylton73574ee2000-10-12 18:54:18 +0000964 try:
965 host = socket.gethostbyname(host)
966 except socket.error, msg:
967 raise URLError(msg)
Fred Drake13a2c272000-02-10 17:17:14 +0000968 host, port = splitport(host)
969 if port is None:
970 port = ftplib.FTP_PORT
971 path, attrs = splitattr(req.get_selector())
972 path = unquote(path)
Eric S. Raymondb08b2d32001-02-09 11:10:16 +0000973 dirs = path.split('/')
Fred Drake13a2c272000-02-10 17:17:14 +0000974 dirs, file = dirs[:-1], dirs[-1]
975 if dirs and not dirs[0]:
976 dirs = dirs[1:]
977 user = passwd = '' # XXX
978 try:
979 fw = self.connect_ftp(user, passwd, host, port, dirs)
980 type = file and 'I' or 'D'
981 for attr in attrs:
982 attr, value = splitattr(attr)
Eric S. Raymondb08b2d32001-02-09 11:10:16 +0000983 if attr.lower() == 'type' and \
Fred Drake13a2c272000-02-10 17:17:14 +0000984 value in ('a', 'A', 'i', 'I', 'd', 'D'):
Eric S. Raymondb08b2d32001-02-09 11:10:16 +0000985 type = value.upper()
Fred Drake13a2c272000-02-10 17:17:14 +0000986 fp, retrlen = fw.retrfile(file, type)
Guido van Rossum833a8d82001-08-24 13:10:13 +0000987 headers = ""
988 mtype = mimetypes.guess_type(req.get_full_url())[0]
989 if mtype:
Brett Cannon783eaf42003-06-17 21:52:34 +0000990 headers += "Content-type: %s\n" % mtype
Fred Drake13a2c272000-02-10 17:17:14 +0000991 if retrlen is not None and retrlen >= 0:
Brett Cannon783eaf42003-06-17 21:52:34 +0000992 headers += "Content-length: %d\n" % retrlen
Guido van Rossum833a8d82001-08-24 13:10:13 +0000993 sf = StringIO(headers)
994 headers = mimetools.Message(sf)
Fred Drake13a2c272000-02-10 17:17:14 +0000995 return addinfourl(fp, headers, req.get_full_url())
996 except ftplib.all_errors, msg:
997 raise IOError, ('ftp error', msg), sys.exc_info()[2]
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000998
999 def connect_ftp(self, user, passwd, host, port, dirs):
1000 fw = ftpwrapper(user, passwd, host, port, dirs)
1001## fw.ftp.set_debuglevel(1)
1002 return fw
1003
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps connections in a cache keyed by
    (user, passwd, host, port)."""

    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        """Start with an empty cache, a 60 second delay and room for
        16 connections."""
        self.cache = {}      # key -> ftpwrapper
        self.timeout = {}    # key -> time after which the entry is stale
        self.soonest = 0     # smallest value currently in self.timeout
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        """Set the delay added to the current time on each use."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which one entry is dropped."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs):
        """Return the cached ftpwrapper for this login, creating it
        first when there is none, and refresh its expiry time."""
        key = user, passwd, host, port
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Close and drop stale entries, then drop one more entry when
        the cache has reached max_conns."""
        # first check for old ones
        now = time.time()
        if self.soonest <= now:
            # items() hands back a list, so deleting while looping is safe
            for key, expiry in self.timeout.items():
                if expiry < now:
                    self.cache[key].close()
                    del self.cache[key]
                    del self.timeout[key]
        self.soonest = min(self.timeout.values())

        # then check the size
        if len(self.cache) == self.max_conns:
            for key, expiry in self.timeout.items():
                if expiry == self.soonest:
                    # NOTE(review): unlike the stale case above, the
                    # entry is dropped here without close() -- confirm
                    # whether that is intended
                    del self.cache[key]
                    del self.timeout[key]
                    break
            self.soonest = min(self.timeout.values())
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001049
class GopherHandler(BaseHandler):
    """Open gopher: URLs through gopherlib."""

    def gopher_open(self, req):
        """Send the selector (and query, if any) of req to its host.

        Returns an addinfourl object with empty headers.  Raises
        GopherError when req has no host.
        """
        host = req.get_host()
        if not host:
            raise GopherError('no host given')
        host = unquote(host)
        # the gopher item type is split off but not used further
        gtype, rest = splitgophertype(req.get_selector())
        rest, query = splitquery(rest)
        selector = unquote(rest)
        if not query:
            fp = gopherlib.send_selector(selector, host)
        else:
            query = unquote(query)
            fp = gopherlib.send_query(selector, query, host)
        return addinfourl(fp, noheaders(), req.get_full_url())
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001066
1067#bleck! don't use this yet
class OpenerFactory:
    """Experimental builder of OpenerDirector objects (unfinished)."""

    # handlers installed into every opener; classes are instantiated
    default_handlers = [UnknownHandler, HTTPHandler,
                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
                        FTPHandler, FileHandler]
    handlers = []
    replacement_handlers = []

    def add_handler(self, h):
        """Record h on this factory.

        A new list is bound on the instance, so the class-level list
        is never mutated.
        """
        self.handlers = self.handlers + [h]

    def replace_handler(self, h):
        """Not implemented; does nothing."""
        pass

    def build_opener(self):
        """Return an OpenerDirector loaded with the default handlers.

        NOTE(review): handlers recorded with add_handler() are not
        consulted here -- confirm the intended behaviour.
        """
        opener = OpenerDirector()
        for ph in self.default_handlers:
            if inspect.isclass(ph):
                ph = ph()
            opener.add_handler(ph)
        # hand the finished opener back; without this the caller gets None
        return opener
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001088
# Manual smoke test: fetch a fixed set of URLs through an opener that
# adds CacheFTPHandler and GopherHandler, and report how much was read.
if __name__ == "__main__":
    # XXX some of the test code depends on machine configurations that
    # are internal to CNRI.   Need to set up a public server with the
    # right authentication configuration for test purposes.
    # localhost is only referenced by the disabled code below.
    if socket.gethostname() == 'bitdiddle':
        localhost = 'bitdiddle.cnri.reston.va.us'
    elif socket.gethostname() == 'bitdiddle.concentric.net':
        localhost = 'localhost'
    else:
        localhost = None
    # Each entry is a URL string or a (url, data) tuple; the data is
    # passed to urlopen() as its second argument.
    urls = [
        # Thanks to Fred for finding these!
        'gopher://gopher.lib.ncsu.edu/11/library/stacks/Alex',
        'gopher://gopher.vt.edu:10010/10/33',

        'file:/etc/passwd',
        'file://nonsensename/etc/passwd',
        'ftp://www.python.org/pub/python/misc/sousa.au',
        'ftp://www.python.org/pub/tmp/blat',
        'http://www.espn.com/', # redirect
        'http://www.python.org/Spanish/Inquistion/',
        ('http://www.python.org/cgi-bin/faqw.py',
         'query=pythonistas&querytype=simple&casefold=yes&req=search'),
        'http://www.python.org/',
        'ftp://gatekeeper.research.compaq.com/pub/DEC/SRC/research-reports/00README-Legal-Rules-Regs',
            ]

##     if localhost is not None:
##         urls = urls + [
##             'file://%s/etc/passwd' % localhost,
##             'http://%s/simple/' % localhost,
##             'http://%s/digest/' % localhost,
##             'http://%s/not/found.h' % localhost,
##             ]

##     bauth = HTTPBasicAuthHandler()
##     bauth.add_password('basic_test_realm', localhost, 'jhylton',
##                        'password')
##     dauth = HTTPDigestAuthHandler()
##     dauth.add_password('digest_test_realm', localhost, 'jhylton',
##                        'password')


    # a one second delay makes cached FTP connections expire quickly
    cfh = CacheFTPHandler()
    cfh.setTimeout(1)

##     # XXX try out some custom proxy objects too!
##     def at_cnri(req):
##         host = req.get_host()
##         print host
##         if host[-18:] == '.cnri.reston.va.us':
##             return 1
##     p = CustomProxy('http', at_cnri, 'proxy.cnri.reston.va.us')
##     ph = CustomProxyHandler(p)

##     install_opener(build_opener(dauth, bauth, cfh, GopherHandler, ph))
    install_opener(build_opener(cfh, GopherHandler))

    for url in urls:
        # split a (url, data) entry; plain strings carry no data
        if isinstance(url, tuple):
            url, req = url
        else:
            req = None
        print url
        try:
            f = urlopen(url, req)
        except IOError, err:
            print "IOError:", err
        except socket.error, err:
            print "socket.error:", err
        else:
            buf = f.read()
            f.close()
            print "read %d bytes" % len(buf)
        print
        time.sleep(0.1)
1164 time.sleep(0.1)