blob: 7124dfb0557db2c86b2d6964dd51ef6086f32ece [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""An extensible library for opening URLs using a variety of protocols
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00002
3The simplest way to use this module is to call the urlopen function,
Tim Peterse1190062001-01-15 03:34:38 +00004which accepts a string containing a URL or a Request object (described
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00005below). It opens the URL and returns the results as file-like
6object; the returned object has some extra methods described below.
7
Jeremy Hyltone1906632002-10-11 17:27:55 +00008The OpenerDirector manages a collection of Handler objects that do
Tim Peterse1190062001-01-15 03:34:38 +00009all the actual work. Each Handler implements a particular protocol or
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000010option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
Raymond Hettinger024aaa12003-04-24 15:32:12 +000014HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000016
urlopen(url, data=None) -- basic usage is the same as the original
urllib: pass the URL and optionally data to post to an HTTP URL, and
Tim Peterse1190062001-01-15 03:34:38 +000019get a file-like object back. One difference is that you can also pass
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000020a Request instance instead of URL. Raises a URLError (subclass of
21IOError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- function that creates a new OpenerDirector instance.
25will install the default handlers. accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of the default
28handler, the argument will be installed instead of the default.
29
30install_opener -- installs a new opener as the default opener.
31
32objects of interest:
33OpenerDirector --
34
Request -- an object that encapsulates the state of a request.  The
state can be as simple as the URL.  It can also include extra HTTP
headers, e.g. a User-Agent.
38
39BaseHandler --
40
41exceptions:
42URLError-- a subclass of IOError, individual protocols have their own
43specific subclass
44
Tim Peterse1190062001-01-15 03:34:38 +000045HTTPError-- also a valid HTTP response, so you can treat an HTTP error
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000046as an exceptional event or valid response
47
48internals:
49BaseHandler and parent
50_call_chain conventions
51
52Example usage:
53
54import urllib2
55
56# set up authentication info
57authinfo = urllib2.HTTPBasicAuthHandler()
58authinfo.add_password('realm', 'host', 'username', 'password')
59
Moshe Zadka8a18e992001-03-01 08:40:42 +000060proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
61
Tim Peterse1190062001-01-15 03:34:38 +000062# build a new opener that adds authentication and caching FTP handlers
Moshe Zadka8a18e992001-03-01 08:40:42 +000063opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000064
65# install it
66urllib2.install_opener(opener)
67
68f = urllib2.urlopen('http://www.python.org/')
69
70
71"""
72
73# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails for some reason, how should the error be signalled?  The
# client needs to know the HTTP error code.  But if the handler knows
# that the problem was, e.g., that it didn't know the hash algorithm
# requested in the challenge, it would be good to pass that
# information along to the client, too.
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000080
81# XXX to do:
82# name!
83# documentation (getting there)
84# complex proxies
85# abstract factory for opener
86# ftp errors aren't handled cleanly
87# gopher can return a socket.error
88# check digest against correct (i.e. non-apache) implementation
89
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000090import socket
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000091import httplib
Jeremy Hylton8b78b992001-10-09 16:18:45 +000092import inspect
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000093import re
94import base64
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000095import urlparse
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000096import md5
97import mimetypes
98import mimetools
Jeremy Hylton6d8c1aa2001-08-27 20:16:53 +000099import rfc822
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000100import ftplib
101import sys
102import time
Jeremy Hylton6d8c1aa2001-08-27 20:16:53 +0000103import os
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000104import gopherlib
Moshe Zadka8a18e992001-03-01 08:40:42 +0000105import posixpath
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000106
107try:
108 from cStringIO import StringIO
109except ImportError:
110 from StringIO import StringIO
111
112try:
113 import sha
114except ImportError:
115 # need 1.5.2 final
116 sha = None
117
118# not sure how many of these need to be gotten rid of
119from urllib import unwrap, unquote, splittype, splithost, \
120 addinfourl, splitport, splitgophertype, splitquery, \
121 splitattr, ftpwrapper, noheaders
122
123# support for proxies via environment variables
124from urllib import getproxies
125
126# support for FileHandler
Moshe Zadka8a18e992001-03-01 08:40:42 +0000127from urllib import localhost, url2pathname
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000128
129__version__ = "2.0a1"
130
_opener = None


def urlopen(url, data=None):
    """Open url with the module-wide default opener and return its result.

    url and data are handed unchanged to the opener's open() method.
    The default opener is created with build_opener() the first time it
    is needed, unless one has already been stored in _opener.
    """
    global _opener
    opener = _opener
    if opener is None:
        # Lazily create and remember the default opener.
        opener = _opener = build_opener()
    return opener.open(url, data)
137
def install_opener(opener):
    """Store opener in the module-level _opener slot used by urlopen()."""
    global _opener
    _opener = opener
141
142# do these error classes make sense?
Tim Peterse1190062001-01-15 03:34:38 +0000143# make sure all of the IOError stuff is overridden. we just want to be
Fred Drakea87a5212002-08-13 13:59:55 +0000144# subtypes.
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000145
class URLError(IOError):
    """Error carrying the cause of a failed URL operation in ``reason``.

    URLError is a sub-type of IOError, but it doesn't share any of the
    implementation; __init__ and __str__ are both overridden.  ``args``
    is set for compatibility with other EnvironmentError subclasses, but
    it holds only the reason rather than the usual (errno, strerror)
    pair.  This may be better than nothing.
    """

    def __init__(self, reason):
        self.reason = reason
        self.args = (reason,)

    def __str__(self):
        return '<urlopen error %s>' % self.reason
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000158
class HTTPError(URLError, addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return"""
    __super_init = addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        self.code, self.msg = code, msg
        self.hdrs, self.fp = hdrs, fp
        self.filename = url
        # The addinfourl classes depend on fp being a valid file
        # object.  In some cases, the HTTPError may not have a valid
        # file object.  If this happens, the simplest workaround is to
        # not initialize the base classes.
        if fp is None:
            return
        self.__super_init(fp, hdrs, url)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)

    def __del__(self):
        # XXX is this safe? what if user catches exception, then
        # extracts fp and discards exception?
        fp = self.fp
        if fp:
            fp.close()
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000184
class GopherError(URLError):
    """URLError subclass that adds no behaviour of its own."""
187
Moshe Zadka8a18e992001-03-01 08:40:42 +0000188
class Request:
    """Encapsulates the state of a single URL request.

    Public attributes: ``type``, ``host``, ``port``, ``data`` and
    ``headers``.  The URL is split lazily: ``type`` and ``host`` stay
    None until get_type() / get_host() are called (or set_proxy()
    assigns them).
    """

    def __init__(self, url, data=None, headers=None):
        """Create a request for url.

        url     -- URL string; a '<URL:...>' wrapper is removed.
        data    -- optional payload; when not None the method is POST.
        headers -- optional mapping of header name -> value; each entry
                   goes through add_header().  None means no headers.
        """
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self.data = data
        self.headers = {}
        # A None default avoids sharing one dict object between calls.
        if headers is not None:
            for key, value in headers.items():
                self.add_header(key, value)

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        if attr[:12] == '_Request__r_':
            name = attr[12:]
            if hasattr(Request, 'get_' + name):
                getattr(self, 'get_' + name)()
                # Only hand the value back if the getter really created
                # it; re-entering getattr() here would recurse forever
                # when the getter had nothing to do (e.g. host was
                # assigned by hand).
                if attr in self.__dict__:
                    return self.__dict__[attr]
        raise AttributeError(attr)

    def get_method(self):
        """Return "POST" when the request carries data, else "GET"."""
        if self.has_data():
            return "POST"
        return "GET"

    def add_data(self, data):
        """Set the request payload."""
        self.data = data

    def has_data(self):
        """Return true if a payload (anything but None) is set."""
        return self.data is not None

    def get_data(self):
        """Return the request payload (None if there is none)."""
        return self.data

    def get_full_url(self):
        """Return the unwrapped URL given to the constructor."""
        return self.__original

    def get_type(self):
        """Return the URL scheme, splitting the URL on first use.

        Raises ValueError if the URL has no scheme.
        """
        if self.type is None:
            self.type, self.__r_type = splittype(self.__original)
            if self.type is None:
                raise ValueError("unknown url type: %s" % self.__original)
        return self.type

    def get_host(self):
        """Return the unquoted host part, splitting it off on first use."""
        if self.host is None:
            self.host, self.__r_host = splithost(self.__r_type)
            if self.host:
                self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        """Return what follows the host (or the full URL after set_proxy)."""
        return self.__r_host

    def set_proxy(self, host, type):
        """Route the request through a proxy.

        host and type replace the request's own; the selector becomes
        the complete original URL.
        """
        self.host, self.type = host, type
        self.__r_host = self.__original

    def add_header(self, key, val):
        """Store a header under key.capitalize()."""
        # useful for something like authentication
        self.headers[key.capitalize()] = val
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000257
class OpenerDirector:
    """Dispatches requests to the handlers registered with add_handler().

    handle_open maps a protocol name to the ordered list of handlers
    that define ``<protocol>_open``; handle_error maps a protocol name
    to a dict from error kind (an int where the name's suffix is
    numeric, otherwise a string) to the handlers that define
    ``<protocol>_error_<kind>``.
    """

    def __init__(self):
        server_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', server_version)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}

    def add_handler(self, handler):
        """Register handler under every protocol/error method it defines.

        The handler is appended to self.handlers, and told about this
        opener via add_parent(), only if at least one of its attribute
        names matched ``*_open`` or ``*_error_*``.
        """
        added = False
        for meth in dir(handler):
            if meth[-5:] == '_open':
                chain = self.handle_open.setdefault(meth[:-5], [])
                chain.append(handler)
                chain.sort()
                added = True
                continue
            # "<proto>_error_<kind>": everything after the second
            # underscore is the kind, further underscores included.
            parts = meth.split('_', 2)
            if len(parts) == 3 and parts[1] == 'error':
                proto, kind = parts[0], parts[2]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                by_kind = self.handle_error.setdefault(proto, {})
                chain = by_kind.setdefault(kind, [])
                chain.append(handler)
                chain.sort()
                added = True
        if added:
            self.handlers.append(handler)
            self.handlers.sort()
            handler.add_parent(self)

    def __del__(self):
        self.close()

    def close(self):
        """Call close() on every registered handler and forget them."""
        for handler in self.handlers:
            handler.close()
        self.handlers = []

    def _call_chain(self, chain, kind, meth_name, *args):
        """Call meth_name(*args) on each handler in chain[kind] in turn.

        Returns the first result that is not None; returns None if the
        chain is missing, empty or exhausted.
        """
        # XXX raise an exception if no one else should try to handle
        # this url.  return None if you can't but someone else could.
        for handler in chain.get(kind, ()):
            result = getattr(handler, meth_name)(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None):
        """Open fullurl, which may be a URL string or a Request object.

        Tries the 'default' chain, then the chain for the request's own
        type, then the 'unknown' chain, returning the first true result
        (or whatever the 'unknown' chain returns).
        """
        # accept a URL or a Request object
        if isinstance(fullurl, basestring):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        type_ = req.get_type()
        result = self._call_chain(self.handle_open, type_,
                                  type_ + '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered error handlers.

        For 'http' and 'https' the third positional argument is taken as
        the numeric status code and selects ``http_error_<code>``; if no
        handler returns a true result, ``http_error_default`` handlers
        are tried with the original arguments.  For any other protocol
        the ``<proto>_error`` method of handlers in
        self.handle_error[proto] is called.
        """
        if proto in ['http', 'https']:
            # XXX http[s] protocols are special-cased
            # https is not different than http
            chain = self.handle_error['http']
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%d' % proto
            http_err = True
            orig_args = args
        else:
            chain = self.handle_error
            meth_name = proto + '_error'
            http_err = False
        args = (chain, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (chain, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000364
Gustavo Niemeyer9556fba2003-06-07 17:53:08 +0000365# XXX probably also want an abstract factory that knows when it makes
366# sense to skip a superclass in favor of a subclass and when it might
367# make sense to include both
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000368
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    Each argument may be a handler instance or a handler class (which is
    instantiated with no arguments).  A default handler class is left
    out when any argument is an instance or subclass of it.  Default
    handlers are added first, then the arguments in the order given.
    """
    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(HTTPSHandler)

    for klass in default_classes:
        # Stop at the first override: collecting the class once per
        # matching argument and removing it once per entry used to raise
        # ValueError when two arguments derived from the same default.
        for check in handlers:
            if inspect.isclass(check):
                overridden = issubclass(check, klass)
            else:
                overridden = isinstance(check, klass)
            if overridden:
                break
        else:
            opener.add_handler(klass())

    for h in handlers:
        if inspect.isclass(h):
            h = h()
        opener.add_handler(h)
    return opener
404
class BaseHandler:
    """Common base for handlers: parent bookkeeping and ordering.

    Instances sort by ``handler_order``, lowest first.
    """

    handler_order = 500

    def add_parent(self, parent):
        self.parent = parent

    def close(self):
        self.parent = None

    def __lt__(self, other):
        if hasattr(other, "handler_order"):
            return self.handler_order < other.handler_order
        # Try to preserve the old behavior of having custom classes
        # inserted after default ones (works only for custom user
        # classes which are not aware of handler_order).
        return True
Tim Petersf545baa2003-06-15 23:26:30 +0000419
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000420
class HTTPDefaultErrorHandler(BaseHandler):
    """Handler whose http_error_default always raises HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        url = req.get_full_url()
        raise HTTPError(url, code, msg, hdrs, fp)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000424
class HTTPRedirectHandler(BaseHandler):
    """Follows HTTP 301, 302, 303 and 307 redirects."""

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if ((code in (301, 302, 303, 307) and m in ("GET", "HEAD"))
                or (code in (301, 302, 303) and m == "POST")):
            # Strictly (according to RFC 2616), 301 or 302 in response
            # to a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we
            # do the same.
            return Request(newurl, headers=req.headers)
        raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow the redirect named by the 'location' (or 'uri') header.

        Returns None when neither header is present or when
        redirect_request() declines.  Raises HTTPError when the target
        is not an http, https or ftp URL, or when a redirect loop is
        detected.  Otherwise returns the result of opening the new
        request through the parent opener.
        """
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # The target comes from the server and is untrusted: only follow
        # it to network schemes, never to e.g. file: URLs.
        lowered = newurl.lower()
        if not (lowered.startswith('http://') or
                lowered.startswith('https://') or
                lowered.startswith('ftp://')):
            raise HTTPError(newurl, code,
                            msg + " - Redirection to url '%s' is not allowed"
                            % newurl,
                            headers, fp)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        new.error_302_dict = {}
        if hasattr(req, 'error_302_dict'):
            seen = req.error_302_dict
            if len(seen) > 10 or newurl in seen:
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
            new.error_302_dict.update(seen)
        new.error_302_dict[newurl] = newurl

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000490
class ProxyHandler(BaseHandler):
    """Redirects requests for configured schemes through proxy URLs."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        """Install a ``<scheme>_open`` attribute for every proxy entry.

        proxies -- mapping of scheme -> proxy URL; when None, the result
                   of getproxies() is used.
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, url in proxies.items():
            # Defaults bind the current loop values into each lambda.
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy=url, type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Point req at proxy and decide who continues handling it.

        Returns None (letting later handlers proceed) when the proxy URL
        has the same scheme as the original request; otherwise re-opens
        the request through the parent opener.  The incoming ``type``
        argument is replaced by the scheme parsed from ``proxy``.
        """
        orig_type = req.get_type()
        type, r_type = splittype(proxy)
        host, XXX = splithost(r_type)
        if '@' in host:
            user_pass, host = host.split('@', 1)
            if ':' in user_pass:
                user, password = user_pass.split(':', 1)
                raw = '%s:%s' % (unquote(user), unquote(password))
                # encodestring() ends its output with a newline and
                # wraps long input; neither may appear in a header value.
                user_pass = base64.encodestring(raw).replace('\n', '')
            # NOTE(review): without a ':' the user info is sent as given,
            # not base64-encoded -- confirm whether that is intended.
            req.add_header('Proxy-authorization', 'Basic ' + user_pass)
        host = unquote(host)
        req.set_proxy(host, type)
        if orig_type == type:
            # let other handlers take care of it
            # XXX this only makes sense if the proxy is before the
            # other handlers
            return None
        # need to start over, because the other handlers don't
        # grok the proxy's URL type
        return self.parent.open(req)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000527
528# feature suggested by Duncan Booth
529# XXX custom is not a good name
class CustomProxy:
    """Pairs a protocol name and proxy address with an optional predicate.

    Either pass a function to the constructor or override handle().
    """

    def __init__(self, proto, func=None, proxy_addr=None):
        self.proto, self.func, self.addr = proto, func, proxy_addr

    def handle(self, req):
        # Returns 1 when a predicate is set and accepts req; otherwise
        # falls through and returns None.
        predicate = self.func
        if predicate:
            if predicate(req):
                return 1

    def get_proxy(self):
        return self.addr
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000543
class CustomProxyHandler(BaseHandler):
    """Chooses a proxy per request from registered CustomProxy objects.

    self.proxies maps a protocol name to the list of proxy objects
    registered for it, in registration order.
    """

    # Proxies must be in front
    handler_order = 100

    def __init__(self, *proxies):
        """Register every proxy object passed as an argument."""
        self.proxies = {}
        # The arguments used to be dropped, leaving the handler empty.
        for cpo in proxies:
            self.add_proxy(cpo)

    def proxy_open(self, req):
        """Re-open req through the first registered proxy that accepts it.

        Returns None when no proxy is registered for the request's type
        or none of them accepts the request.
        """
        proto = req.get_type()
        try:
            proxies = self.proxies[proto]
        except KeyError:
            return None
        for p in proxies:
            if p.handle(req):
                # set_proxy() needs (host, type); the request keeps its
                # own scheme.
                req.set_proxy(p.get_proxy(), proto)
                return self.parent.open(req)
        return None

    def do_proxy(self, p, req):
        """Open req through the parent opener (p is not used)."""
        return self.parent.open(req)

    def add_proxy(self, cpo):
        """Append cpo to the proxies registered for cpo.proto."""
        self.proxies.setdefault(cpo.proto, []).append(cpo)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000571
class HTTPPasswordMgr:
    """Stores (user, password) pairs keyed by realm and URI prefix.

    self.passwd maps realm -> {tuple of reduced URIs -> (user, passwd)},
    where a reduced URI is the (netloc, path) pair from reduce_uri().
    """

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for realm at one URI or a sequence of URIs."""
        # uri could be a single URI or a sequence
        if isinstance(uri, basestring):
            uri = [uri]
        uri = tuple(map(self.reduce_uri, uri))
        self.passwd.setdefault(realm, {})[uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) registered for realm covering authuri.

        Returns (None, None) when nothing matches.
        """
        domains = self.passwd.get(realm, {})
        authuri = self.reduce_uri(authuri)
        for uris, authinfo in domains.items():
            for uri in uris:
                if self.is_suburi(uri, authuri):
                    return authinfo
        return None, None

    def reduce_uri(self, uri):
        """Accept netloc or URI and extract only the netloc and path"""
        parts = urlparse.urlparse(uri)
        if parts[1]:
            return parts[1], parts[2] or '/'
        # No netloc was parsed: treat the whole path as the netloc.
        return parts[2], '/'

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare whole path segments: a plain character prefix would
        # let '/foo' match '/foobar' and hand credentials to a sibling.
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix = prefix + '/'
        return test[1].startswith(prefix)
Tim Peterse1190062001-01-15 03:34:38 +0000615
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000616
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the realm None."""

    def find_user_password(self, realm, authuri):
        lookup = HTTPPasswordMgr.find_user_password
        user, password = lookup(self, realm, authuri)
        if user is None:
            # Nothing for the specific realm: try the entries stored
            # under the realm None.
            return lookup(self, None, authuri)
        return user, password
625
626
class AbstractBasicAuthHandler:
    """Shared logic for answering HTTP Basic authentication challenges.

    The code reads ``self.auth_header`` (name of the request header that
    carries the credentials) and ``self.parent`` (opener used for the
    retry); neither is set in this class.
    """

    rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)

    # XXX there can actually be multiple auth-schemes in a
    # www-authenticate header.  should probably be a lot more careful
    # in parsing them to extract multiple alternatives

    def __init__(self, password_mgr=None):
        """Use password_mgr for lookups; default is a new HTTPPasswordMgr."""
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry req with Basic credentials if the challenge asks for them.

        authreq -- name of the challenge header to read from headers.
        Returns None when the header is absent, does not parse, or names
        a scheme other than basic.
        """
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)
        if authreq:
            mo = AbstractBasicAuthHandler.rx.match(authreq)
            if mo:
                scheme, realm = mo.groups()
                if scheme.lower() == 'basic':
                    return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-open req with an auth header built for (realm, host).

        Returns None when no password is known, or when the request
        already carries exactly these credentials (so a rejected
        password is not retried).
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if not pw:
            return None
        raw = "%s:%s" % (user, pw)
        # encodestring() wraps its output in lines; drop every newline,
        # not just the trailing one, so long credentials stay on a
        # single header line.
        auth = 'Basic %s' % base64.encodestring(raw).replace('\n', '')
        if req.headers.get(self.auth_header, None) == auth:
            return None
        req.add_header(self.auth_header, auth)
        return self.parent.open(req)
662
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses with Basic credentials for the URL's host."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # Element 1 of the urlparse() result is the network location.
        netloc = urlparse.urlparse(req.get_full_url())[1]
        return self.http_error_auth_reqed('www-authenticate', netloc,
                                          req, headers)
671
672
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses with Basic credentials for the request host."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        request_host = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          request_host, req, headers)
681
682
class AbstractDigestAuthHandler:
    """Mixin implementing the retry step of HTTP Digest authentication.

    Subclasses supply ``auth_header`` (the request header that carries
    the credentials) and an ``http_error_NNN`` method that calls
    http_error_auth_reqed().  The retry is issued through
    ``self.parent.open()``.
    """

    def __init__(self, passwd=None):
        """Use passwd for credential lookup (a fresh HTTPPasswordMgr
        when omitted) and expose its add_password."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry req if the challenge header named authreq asks for Digest.

        Returns the retried response, or None when no Digest challenge
        is present or no authorization could be computed.
        """
        # The challenge lives in the response header named by authreq,
        # not in self.auth_header, which names the request header that
        # will carry our answer.
        challenge = headers.get(authreq, None)
        if challenge:
            words = challenge.split()
            # The scheme name is compared case-insensitively, as the
            # Basic handler does.
            if words and words[0].lower() == 'digest':
                return self.retry_http_digest_auth(req, challenge)

    def retry_http_digest_auth(self, req, auth):
        """Re-send req with a Digest authorization computed from auth.

        Returns None when the challenge carries no parameters, when no
        authorization can be computed, or when the same value was
        already on the request (avoids a retry loop).
        """
        pieces = auth.split(None, 1)
        if len(pieces) < 2:
            # A bare scheme name with no parameters cannot be answered.
            return None
        token, challenge = pieces
        chal = parse_keqv_list(parse_http_list(challenge))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp

    def get_authorization(self, req, chal):
        """Build the parameter list of the authorization header.

        chal is the parsed challenge (a dict).  Returns None when the
        challenge lacks realm or nonce, names an unsupported algorithm,
        or no user is known for the realm and URL.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
        except KeyError:
            return None
        algorithm = chal.get('algorithm', 'MD5')
        # mod_digest doesn't send an opaque, even though it isn't
        # supposed to be optional
        opaque = chal.get('opaque', None)

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm,
                                                  req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.has_data() and 'POST' or 'GET',
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base = base + ', opaque="%s"' % opaque
        if entdig:
            base = base + ', digest="%s"' % entdig
        if algorithm != 'MD5':
            base = base + ', algorithm="%s"' % algorithm
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) functions for algorithm.

        Returns (None, None) for an algorithm that is not supported, so
        callers can test ``H is None``.
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x, e=encode_digest: e(md5.new(x).digest())
        elif algorithm == 'SHA':
            H = lambda x, e=encode_digest: e(sha.new(x).digest())
        else:
            # XXX MD5-sess
            return None, None
        KD = lambda s, d, H=H: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000767
Moshe Zadka8a18e992001-03-01 08:40:42 +0000768
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry a 401 response with Digest credentials.

        Returns whatever http_error_auth_reqed() returns: the retried
        response, or None when the challenge could not be answered.
        """
        # Element 1 of the urlparse() result is the network location.
        host = urlparse.urlparse(req.get_full_url())[1]
        # The result must be returned; without the return the retried
        # response was discarded and this method always yielded None.
        return self.http_error_auth_reqed('www-authenticate', host,
                                          req, headers)
781
782
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 responses with Digest credentials."""

    auth_header = 'Proxy-Authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Retry a 407 response with Digest credentials.

        Returns whatever http_error_auth_reqed() returns: the retried
        response, or None when the challenge could not be answered.
        """
        host = req.get_host()
        # The result must be returned; without the return the retried
        # response was discarded and this method always yielded None.
        return self.http_error_auth_reqed('proxy-authenticate', host,
                                          req, headers)
790
791
def encode_digest(digest):
    """Return digest as a string of lowercase hexadecimal digits.

    Every character of the input contributes two digits: the high
    nibble of its ordinal first, then the low nibble.
    """
    pairs = []
    for ch in digest:
        value = ord(ch)
        pairs.append('%x%x' % ((value >> 4) & 0xf, value & 0xf))
    return ''.join(pairs)
Tim Peterse1190062001-01-15 03:34:38 +0000800
801
Moshe Zadka8a18e992001-03-01 08:40:42 +0000802class AbstractHTTPHandler(BaseHandler):
803
Jeremy Hylton828023b2003-05-04 23:44:49 +0000804 # XXX Should rewrite do_open() to use the new httplib interface,
805 # would would be a little simpler.
806
Moshe Zadka8a18e992001-03-01 08:40:42 +0000807 def do_open(self, http_class, req):
Moshe Zadka76676802001-04-11 07:44:53 +0000808 host = req.get_host()
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000809 if not host:
810 raise URLError('no host given')
811
Jeremy Hylton828023b2003-05-04 23:44:49 +0000812 h = http_class(host) # will parse host:port
813 if req.has_data():
814 data = req.get_data()
815 h.putrequest('POST', req.get_selector())
816 if not 'Content-type' in req.headers:
817 h.putheader('Content-type',
818 'application/x-www-form-urlencoded')
819 if not 'Content-length' in req.headers:
820 h.putheader('Content-length', '%d' % len(data))
821 else:
822 h.putrequest('GET', req.get_selector())
Tim Peterse1190062001-01-15 03:34:38 +0000823
Jeremy Hylton144dea32002-07-07 16:57:35 +0000824 scheme, sel = splittype(req.get_selector())
825 sel_host, sel_path = splithost(sel)
826 h.putheader('Host', sel_host or host)
Brett Cannon783eaf42003-06-17 21:52:34 +0000827 for name, value in self.parent.addheaders:
828 name = name.capitalize()
Jeremy Hylton96f11292002-10-11 17:26:46 +0000829 if name not in req.headers:
Brett Cannon783eaf42003-06-17 21:52:34 +0000830 h.putheader(name, value)
Brett Cannondf0d87a2003-05-18 02:25:07 +0000831 for k, v in req.headers.items():
Fred Drake13a2c272000-02-10 17:17:14 +0000832 h.putheader(k, v)
Jeremy Hyltonf6b444e2003-05-05 01:47:13 +0000833 # httplib will attempt to connect() here. be prepared
834 # to convert a socket error to a URLError.
Jeremy Hylton828023b2003-05-04 23:44:49 +0000835 try:
836 h.endheaders()
837 except socket.error, err:
838 raise URLError(err)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000839 if req.has_data():
Fred Drakeec3dfde2001-07-04 05:18:29 +0000840 h.send(data)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000841
842 code, msg, hdrs = h.getreply()
843 fp = h.getfile()
844 if code == 200:
845 return addinfourl(fp, hdrs, req.get_full_url())
846 else:
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000847 return self.parent.error('http', req, fp, code, msg, hdrs)
848
Moshe Zadka8a18e992001-03-01 08:40:42 +0000849
class HTTPHandler(AbstractHTTPHandler):
    """Open http: URLs using httplib.HTTP connections."""

    def http_open(self, req):
        connection_class = httplib.HTTP
        return self.do_open(connection_class, req)
854
855
# Only define the handler when httplib provides an HTTPS class.
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Open https: URLs using httplib.HTTPS connections."""

        def https_open(self, req):
            connection_class = httplib.HTTPS
            return self.do_open(connection_class, req)
861
862
class UnknownHandler(BaseHandler):
    """Reject requests whose URL type reaches unknown_open()."""

    def unknown_open(self, req):
        raise URLError('unknown url type: %s' % req.get_type())
867
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Each element is split at its first '='; a value enclosed in double
    quotes has the quotes removed.  Returns a dict mapping keys to
    values.  Raises ValueError for an element that contains no '='.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slice instead of indexing so that an empty value ('key=') is
        # stored as '' rather than raising IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
877
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.

    Returns the list of elements with surrounding whitespace stripped;
    quotes are kept.  Raises ValueError("unbalanced quotes") when a
    comma is found after an opening quote that is never closed.
    """
    # XXX this function could probably use more testing

    items = []
    end = len(s)
    i = 0          # position from which the next comma/quote is sought
    start = 0      # position where the current element begins
    inquote = 0
    while i < end:
        cur = s[i:]
        c = cur.find(',')
        q = cur.find('"')
        if c == -1:
            # No more separators: the rest is the last element.
            items.append(s[start:])
            break
        if q == -1:
            if inquote:
                raise ValueError("unbalanced quotes")
            items.append(s[start:i+c])
            i = i + c + 1
            # The next element begins after the comma.  Without this the
            # following element would wrongly include the previous ones.
            start = i
            continue
        if inquote:
            if q < c:
                # The closing quote precedes the comma: element complete.
                items.append(s[start:i+c])
                i = i + c + 1
                start = i
                inquote = 0
            else:
                # The comma lies inside the quotes: skip to the closing
                # quote and look again from there.
                i = i + q
        else:
            if c < q:
                items.append(s[start:i+c])
                i = i + c + 1
                start = i
            else:
                inquote = 1
                i = i + q + 1
    return [item.strip() for item in items]
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000923
class FileHandler(BaseHandler):
    """Open file: URLs, from local disk or by re-dispatching as FTP."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        selector = req.get_selector()
        # Exactly two leading slashes: retry the request as type 'ftp'.
        names_a_host = selector[:2] == '//' and selector[2:3] != '/'
        if not names_a_host:
            return self.open_local_file(req)
        req.type = 'ftp'
        return self.parent.open(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the addresses of this machine, resolved on first use
        and cached on the class."""
        if FileHandler.names is None:
            loopback = socket.gethostbyname('localhost')
            own = socket.gethostbyname(socket.gethostname())
            FileHandler.names = (loopback, own)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return the local file named by req wrapped in addinfourl.

        Raises URLError('file not on local host') when the URL names a
        port or a host that does not resolve to one of get_names().
        """
        host = req.get_host()
        selector = req.get_selector()
        localfile = url2pathname(selector)
        stats = os.stat(localfile)
        size = stats.st_size
        modified = rfc822.formatdate(stats.st_mtime)
        mtype = mimetypes.guess_type(selector)[0]
        header_text = (
            'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        headers = mimetools.Message(StringIO(header_text))
        if host:
            host, port = splitport(host)
            is_local = not host or \
                (not port and socket.gethostbyname(host) in self.get_names())
        else:
            is_local = True
        if not is_local:
            raise URLError('file not on local host')
        return addinfourl(open(localfile, 'rb'),
                          headers, 'file:'+selector)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000961
962class FTPHandler(BaseHandler):
963 def ftp_open(self, req):
Fred Drake13a2c272000-02-10 17:17:14 +0000964 host = req.get_host()
965 if not host:
966 raise IOError, ('ftp error', 'no host given')
967 # XXX handle custom username & password
Jeremy Hylton73574ee2000-10-12 18:54:18 +0000968 try:
969 host = socket.gethostbyname(host)
970 except socket.error, msg:
971 raise URLError(msg)
Fred Drake13a2c272000-02-10 17:17:14 +0000972 host, port = splitport(host)
973 if port is None:
974 port = ftplib.FTP_PORT
975 path, attrs = splitattr(req.get_selector())
976 path = unquote(path)
Eric S. Raymondb08b2d32001-02-09 11:10:16 +0000977 dirs = path.split('/')
Fred Drake13a2c272000-02-10 17:17:14 +0000978 dirs, file = dirs[:-1], dirs[-1]
979 if dirs and not dirs[0]:
980 dirs = dirs[1:]
981 user = passwd = '' # XXX
982 try:
983 fw = self.connect_ftp(user, passwd, host, port, dirs)
984 type = file and 'I' or 'D'
985 for attr in attrs:
986 attr, value = splitattr(attr)
Eric S. Raymondb08b2d32001-02-09 11:10:16 +0000987 if attr.lower() == 'type' and \
Fred Drake13a2c272000-02-10 17:17:14 +0000988 value in ('a', 'A', 'i', 'I', 'd', 'D'):
Eric S. Raymondb08b2d32001-02-09 11:10:16 +0000989 type = value.upper()
Fred Drake13a2c272000-02-10 17:17:14 +0000990 fp, retrlen = fw.retrfile(file, type)
Guido van Rossum833a8d82001-08-24 13:10:13 +0000991 headers = ""
992 mtype = mimetypes.guess_type(req.get_full_url())[0]
993 if mtype:
Brett Cannon783eaf42003-06-17 21:52:34 +0000994 headers += "Content-type: %s\n" % mtype
Fred Drake13a2c272000-02-10 17:17:14 +0000995 if retrlen is not None and retrlen >= 0:
Brett Cannon783eaf42003-06-17 21:52:34 +0000996 headers += "Content-length: %d\n" % retrlen
Guido van Rossum833a8d82001-08-24 13:10:13 +0000997 sf = StringIO(headers)
998 headers = mimetools.Message(sf)
Fred Drake13a2c272000-02-10 17:17:14 +0000999 return addinfourl(fp, headers, req.get_full_url())
1000 except ftplib.all_errors, msg:
1001 raise IOError, ('ftp error', msg), sys.exc_info()[2]
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001002
1003 def connect_ftp(self, user, passwd, host, port, dirs):
1004 fw = ftpwrapper(user, passwd, host, port, dirs)
1005## fw.ftp.set_debuglevel(1)
1006 return fw
1007
class CacheFTPHandler(FTPHandler):
    """FTPHandler that reuses ftpwrapper connections for a while.

    Connections are cached per (user, passwd, host, port, directory)
    and dropped once their timeout passes or the cache is full.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}       # key -> ftpwrapper
        self.timeout = {}     # key -> expiry time (time.time() based)
        self.soonest = 0      # earliest expiry time in self.timeout
        self.delay = 60       # seconds added to "now" to form an expiry
        self.max_conns = 16   # cache size that triggers an eviction

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs):
        """Return a cached ftpwrapper for these parameters, creating
        one when needed, and refresh its expiry time."""
        # The directory is part of the key: a wrapper is created for one
        # directory, so reusing it for another would fetch the wrong file.
        key = user, passwd, host, port, '/'.join(dirs)
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
        self.timeout[key] = time.time() + self.delay
        # Fetch the wrapper before pruning; check_cache() may evict
        # this very entry.
        fw = self.cache[key]
        self.check_cache()
        return fw

    def _next_expiry(self):
        # Earliest remaining expiry, or 0 when nothing is cached.
        if self.timeout:
            return min(self.timeout.values())
        return 0

    def check_cache(self):
        """Close and drop expired connections, then evict one entry
        if the cache holds exactly max_conns connections."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            # Iterate over a copy because entries are deleted in the loop.
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            self.soonest = self._next_expiry()

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = self._next_expiry()
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001053
class GopherHandler(BaseHandler):
    """Open gopher: URLs through gopherlib."""

    def gopher_open(self, req):
        """Send the selector (and query, if any) named by req.

        Returns the reply wrapped in addinfourl with noheaders().
        Raises GopherError when the request has no host.
        """
        raw_host = req.get_host()
        if not raw_host:
            raise GopherError('no host given')
        host = unquote(raw_host)
        # The gopher type split off the selector is not used further.
        type, rest = splitgophertype(req.get_selector())
        rest, query = splitquery(rest)
        selector = unquote(rest)
        if not query:
            fp = gopherlib.send_selector(selector, host)
        else:
            query = unquote(query)
            fp = gopherlib.send_query(selector, query, host)
        return addinfourl(fp, noheaders(), req.get_full_url())
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001070
1071#bleck! don't use this yet
class OpenerFactory:
    """Build OpenerDirector objects from a configurable handler list."""

    default_handlers = [UnknownHandler, HTTPHandler,
                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
                        FTPHandler, FileHandler]
    handlers = []
    replacement_handlers = []

    def add_handler(self, h):
        """Register h (a handler class or instance) on this factory."""
        # Rebinding, rather than appending in place, keeps the list
        # shared at class level untouched.
        self.handlers = self.handlers + [h]

    def replace_handler(self, h):
        # XXX not implemented
        pass

    def build_opener(self):
        """Return an OpenerDirector holding the default handlers
        followed by those registered with add_handler().

        Handler classes are instantiated; instances are used as given.
        """
        opener = OpenerDirector()
        for ph in self.default_handlers + self.handlers:
            if inspect.isclass(ph):
                ph = ph()
            opener.add_handler(ph)
        # Hand the opener back; previously it was built and then dropped.
        return opener
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001092
1093if __name__ == "__main__":
Tim Peterse1190062001-01-15 03:34:38 +00001094 # XXX some of the test code depends on machine configurations that
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001095 # are internal to CNRI. Need to set up a public server with the
1096 # right authentication configuration for test purposes.
1097 if socket.gethostname() == 'bitdiddle':
1098 localhost = 'bitdiddle.cnri.reston.va.us'
Jeremy Hylton73574ee2000-10-12 18:54:18 +00001099 elif socket.gethostname() == 'bitdiddle.concentric.net':
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001100 localhost = 'localhost'
1101 else:
1102 localhost = None
1103 urls = [
Fred Drake13a2c272000-02-10 17:17:14 +00001104 # Thanks to Fred for finding these!
1105 'gopher://gopher.lib.ncsu.edu/11/library/stacks/Alex',
1106 'gopher://gopher.vt.edu:10010/10/33',
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001107
Fred Drake13a2c272000-02-10 17:17:14 +00001108 'file:/etc/passwd',
1109 'file://nonsensename/etc/passwd',
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001110 'ftp://www.python.org/pub/python/misc/sousa.au',
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001111 'ftp://www.python.org/pub/tmp/blat',
Fred Drake13a2c272000-02-10 17:17:14 +00001112 'http://www.espn.com/', # redirect
1113 'http://www.python.org/Spanish/Inquistion/',
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001114 ('http://www.python.org/cgi-bin/faqw.py',
Fred Drake13a2c272000-02-10 17:17:14 +00001115 'query=pythonistas&querytype=simple&casefold=yes&req=search'),
1116 'http://www.python.org/',
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001117 'ftp://gatekeeper.research.compaq.com/pub/DEC/SRC/research-reports/00README-Legal-Rules-Regs',
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001118 ]
1119
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001120## if localhost is not None:
1121## urls = urls + [
1122## 'file://%s/etc/passwd' % localhost,
1123## 'http://%s/simple/' % localhost,
1124## 'http://%s/digest/' % localhost,
1125## 'http://%s/not/found.h' % localhost,
1126## ]
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001127
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001128## bauth = HTTPBasicAuthHandler()
1129## bauth.add_password('basic_test_realm', localhost, 'jhylton',
1130## 'password')
1131## dauth = HTTPDigestAuthHandler()
1132## dauth.add_password('digest_test_realm', localhost, 'jhylton',
1133## 'password')
Tim Peterse1190062001-01-15 03:34:38 +00001134
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001135
1136 cfh = CacheFTPHandler()
1137 cfh.setTimeout(1)
1138
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001139## # XXX try out some custom proxy objects too!
1140## def at_cnri(req):
1141## host = req.get_host()
1142## print host
1143## if host[-18:] == '.cnri.reston.va.us':
1144## return 1
1145## p = CustomProxy('http', at_cnri, 'proxy.cnri.reston.va.us')
1146## ph = CustomProxyHandler(p)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001147
Jeremy Hylton8b78b992001-10-09 16:18:45 +00001148## install_opener(build_opener(dauth, bauth, cfh, GopherHandler, ph))
1149 install_opener(build_opener(cfh, GopherHandler))
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001150
1151 for url in urls:
Walter Dörwald65230a22002-06-03 15:58:32 +00001152 if isinstance(url, tuple):
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001153 url, req = url
1154 else:
1155 req = None
1156 print url
1157 try:
1158 f = urlopen(url, req)
1159 except IOError, err:
Fred Drake13a2c272000-02-10 17:17:14 +00001160 print "IOError:", err
1161 except socket.error, err:
1162 print "socket.error:", err
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001163 else:
1164 buf = f.read()
1165 f.close()
1166 print "read %d bytes" % len(buf)
1167 print
1168 time.sleep(0.1)