"""An extensible library for opening URLs using a variety of protocols

The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.

The OpenerDirector manages a collection of Handler objects that do
all the actual work.  Each Handler implements a particular protocol or
option.  The OpenerDirector is a composite object that invokes the
Handlers needed to open the requested URL.  For example, the
HTTPHandler performs HTTP GET and POST requests and deals with
non-error returns.  The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
deals with digest authentication.

urlopen(url, data=None) -- Basic usage is the same as original
urllib.  Pass the url and optionally data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of a URL.  Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.

build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of the default
handler, the argument will be installed instead of the default.

install_opener -- Installs a new opener as the default opener.

objects of interest:
OpenerDirector --

Request -- An object that encapsulates the state of a request.  The
state can be as simple as the URL.  It can also include extra HTTP
headers, e.g. a User-Agent.

BaseHandler --

exceptions:
URLError -- A subclass of IOError, individual protocols have their own
specific subclass.

HTTPError -- Also a valid HTTP response, so you can treat an HTTP error
as an exceptional event or valid response.

internals:
BaseHandler and parent
_call_chain conventions

Example usage:

import urllib2

# set up authentication info
authinfo = urllib2.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
                      uri='https://mahler:8092/site-updates.py',
                      user='klem',
                      passwd='geheim$parole')

proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)

# install it
urllib2.install_opener(opener)

f = urllib2.urlopen('http://www.python.org/')


"""

# XXX issues:
# If an authentication error handler tries to perform authentication
# for some reason but fails, how should the error be signalled?  The
# client needs to know the HTTP error code.  But if the handler knows
# that the problem was, e.g., that it didn't know the hash algo that
# was requested in the challenge, it would be good to pass that
# information along to the client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation

# Possible extensions:
# complex proxies  XXX not sure what exactly was meant by this
# abstract factory for opener

import base64
import hashlib
import httplib
import mimetools
import os
import posixpath
import random
import re
import socket
import sys
import time
import urlparse
import bisect

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from urllib import (unwrap, unquote, splittype, splithost, quote,
     addinfourl, splitport, splitgophertype, splitquery,
     splitattr, ftpwrapper, noheaders, splituser, splitpasswd, splitvalue)

# support for FileHandler, proxies via environment variables
from urllib import localhost, url2pathname, getproxies

# used in User-Agent header sent
# Built from sys.version_info rather than sys.version[:3]: the slice
# truncates two-digit minor versions ("3.10.4"[:3] == "3.1").
__version__ = "%d.%d" % sys.version_info[:2]

# Module-wide default opener; created lazily by urlopen() or replaced
# through install_opener().
_opener = None
def urlopen(url, data=None):
    """Open url with the module-wide default opener and return the result.

    url and data are passed unchanged to the opener's open() method.
    If no default opener exists yet, one is created with build_opener()
    and stored in the module-level _opener for later calls.
    """
    global _opener
    if _opener is None:
        _opener = build_opener()
    return _opener.open(url, data)

def install_opener(opener):
    """Make opener the module-wide default opener used by urlopen()."""
    global _opener
    _opener = opener

# Do these error classes make sense?
# Make sure all of the IOError stuff is overridden.  We just want to be
# subtypes.

class URLError(IOError):
    """Error raised by this module; carries the cause in .reason.

    reason is stored as given, both as self.reason and as the single
    element of self.args.
    """
    # URLError is a sub-type of IOError, but it doesn't share any of
    # the implementation.  need to override __init__ and __str__.
    # It sets self.args for compatibility with other EnvironmentError
    # subclasses, but args doesn't have the typical format with errno in
    # slot 0 and strerror in slot 1.  This may be better than nothing.
    def __init__(self, reason):
        self.args = reason,
        self.reason = reason

    def __str__(self):
        # Wrap reason in a 1-tuple so that a tuple-valued reason is
        # rendered as a whole instead of being unpacked by '%'.
        return '<urlopen error %s>' % (self.reason,)

class HTTPError(URLError, addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return"""
    # Name-mangled alias so __init__ can call addinfourl's initializer
    # explicitly.
    __super_init = addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        """Record the failing url, status code, message, headers and body.

        url is stored as self.filename; code, msg, hdrs and fp are stored
        under their own names.  URLError.__init__ is not called here.
        """
        self.code = code
        self.msg = msg
        self.hdrs = hdrs
        self.fp = fp
        self.filename = url
        # The addinfourl classes depend on fp being a valid file
        # object.  In some cases, the HTTPError may not have a valid
        # file object.  If this happens, the simplest workaround is to
        # not initialize the base classes.
        if fp is not None:
            self.__super_init(fp, hdrs, url)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)

class GopherError(URLError):
    """URLError subclass for the gopher protocol; adds no behavior."""
    pass

# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    The host is taken from the network-location part of
    request.get_full_url(); if that is empty, the request's "Host"
    header is used instead (empty string when absent).  A trailing
    ":<digits>" port is removed.

    """
    url = request.get_full_url()
    host = urlparse.urlparse(url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    host = _cut_port_re.sub("", host, 1)
    return host.lower()

class Request:
    """State of a single URL request: URL, optional data and headers.

    The URL is split lazily: get_type() and get_host() parse it on first
    use and cache the pieces on the instance.
    """

    # NOTE: the shared {} default for headers is only iterated, never
    # mutated, so it is not changed between calls.
    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        """Create a request for url.

        data, if not None, makes get_method() report "POST".  Each
        (key, value) of headers is added through add_header().  When
        origin_req_host is None it is computed with request_host(self).
        """
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self.data = data
        self.headers = {}
        for key, value in headers.items():
            self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        if attr[:12] == '_Request__r_':
            name = attr[12:]
            if hasattr(Request, 'get_' + name):
                # Calling the getter is expected to set the missing
                # attribute as a side effect; then look it up again.
                getattr(self, 'get_' + name)()
                return getattr(self, attr)
        # Call form of raise: valid on both Python 2 and Python 3.
        raise AttributeError(attr)

    def get_method(self):
        """Return "POST" if the request has data, otherwise "GET"."""
        if self.has_data():
            return "POST"
        else:
            return "GET"

    # XXX these helper methods are lame

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        """Return the unwrapped URL the request was created with."""
        return self.__original

    def get_type(self):
        """Return the URL scheme, splitting the URL on first use.

        Raises ValueError if the URL has no scheme.
        """
        if self.type is None:
            self.type, self.__r_type = splittype(self.__original)
            if self.type is None:
                raise ValueError("unknown url type: %s" % self.__original)
        return self.type

    def get_host(self):
        """Return the unquoted host, splitting it off on first use."""
        if self.host is None:
            self.host, self.__r_host = splithost(self.__r_type)
            if self.host:
                self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        """Return what remains of the URL after the host was split off."""
        return self.__r_host

    def set_proxy(self, host, type):
        """Route the request via host; the selector becomes the full URL."""
        self.host, self.type = host, type
        self.__r_host = self.__original

    def get_origin_req_host(self):
        return self.origin_req_host

    def is_unverifiable(self):
        return self.unverifiable

    def add_header(self, key, val):
        # useful for something like authentication
        # The key is stored in str.capitalize() form.
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        # header_name is matched as given (no capitalization applied).
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        """Return the header value; self.headers wins over unredirected."""
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return a list of (name, value) pairs from both header dicts.

        On a name clash the value from self.headers is used.
        """
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())

class OpenerDirector:
    """Dispatch requests to handler objects registered by add_handler().

    Handlers are indexed by the names of their methods:
    <protocol>_open, <protocol>_request, <protocol>_response and
    <protocol>_error_<kind>.
    """
    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register handler under every protocol method it defines.

        The handler is added to self.handlers, and its add_parent() is
        called, only if at least one of its attribute names matched.
        """
        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            i = meth.find("_")
            if i < 1:
                # No "<protocol>_" prefix (no underscore, or a leading
                # one).  Without this guard a plain attribute such as
                # "open" was split as protocol "ope" / condition "open"
                # and registered by accident.
                continue
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # kind is whatever follows "<protocol>_error_"; numeric
                # kinds (HTTP status codes) are stored as ints.
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # the handlers must work in an specific order, the order
            # is specified in a Handler attribute
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        """Call meth_name(*args) on chain[kind] handlers in order.

        Returns the first result that is not None, or None if there is
        no such result (or no handler for kind).
        """
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)

            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None):
        """Open fullurl (a URL string or a Request) and return the response.

        The request passes through the <protocol>_request processors,
        then _open(), then the <protocol>_response processors.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, basestring):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        protocol = req.get_type()

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        """Try the 'default', protocol-specific, then 'unknown' openers."""
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.get_type()
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the handlers registered for it.

        For 'http'/'https' the third positional argument is the status
        code; http_error_<code> handlers are tried first, then the
        http_error_default handlers.
        """
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            # https is not different than http
            lookup = self.handle_error['http']
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            # NOTE(review): here the chain is the per-protocol mapping
            # itself rather than a kind -> handlers mapping; confirm
            # this path is exercised as intended.
            lookup = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (lookup, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (lookup, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)

# XXX probably also want an abstract factory that knows when it makes
# sense to skip a superclass in favor of a subclass and when it might
# make sense to include both

def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.

    Each argument may be a handler instance or a handler class; classes
    are instantiated with no arguments.
    """
    def isclass(obj):
        # Classic classes are not instances of type but do carry
        # __bases__, so both checks are kept.
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(HTTPSHandler)
    # A set, not a list: two arguments overriding the same default must
    # not queue that default twice, or the second remove() below would
    # raise ValueError.
    skip = set()
    for klass in default_classes:
        for check in handlers:
            if isclass(check):
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                skip.add(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if isclass(h):
            h = h()
        opener.add_handler(h)
    return opener

class BaseHandler:
    """Base class for handlers; provides ordering and parent bookkeeping."""
    # Sort key used by __lt__; subclasses override it to change their
    # position relative to other handlers.
    handler_order = 500

    def add_parent(self, parent):
        """Remember parent (the object this handler was added to)."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        """Order handlers by handler_order."""
        if not hasattr(other, "handler_order"):
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other.handler_order

class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return response, or the result of parent.error() for non-2xx."""
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if not (200 <= code < 300):
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    # https responses get exactly the same treatment.
    https_response = http_response

class HTTPDefaultErrorHandler(BaseHandler):
    """Turn any HTTP error that reaches it into a raised HTTPError."""
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)

class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirects, with loop detection."""
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST"):
            # Strictly (according to RFC 2616), 301 or 302 in response
            # to a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we
            # do the same.
            # be conciliant with URIs containing a space
            newurl = newurl.replace(' ', '%20')
            return Request(newurl,
                           headers=req.headers,
                           origin_req_host=req.get_origin_req_host(),
                           unverifiable=True)
        else:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow the redirect named by the Location (or URI) header.

        Returns None when neither header is present or when
        redirect_request() declines.  Raises HTTPError when the target
        scheme is not allowed or a redirect loop is detected.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return
        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # The target comes from the server, so it is untrusted input.
        # Only follow redirects to HTTP, HTTPS or FTP URLs; otherwise a
        # server could point the client at e.g. a file: URL.
        newurl_lower = newurl.lower()
        if not (newurl_lower.startswith('http://') or
                newurl_lower.startswith('https://') or
                newurl_lower.startswith('ftp://')):
            raise HTTPError(newurl, code,
                            msg + " - Redirection to url '%s' is not allowed"
                            % newurl,
                            headers, fp)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"

def _parse_proxy(proxy):
    """Split a proxy specification into (scheme, user, password, host/port).

    *proxy* may be a complete URL or just an authority.  A URL has to
    contain an authority (host:port) component; per RFC 3986 that means
    the scheme is followed by two slashes:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    Any of the first three items of the result may be None.

    Bare authorities:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    An authority may carry userinfo, which is taken to be
    username:password:

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    The same, written as URLs:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Whatever follows the authority is discarded:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    A URL without a trailing '/' works as well:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, rest = splittype(proxy)
    if rest.startswith("/"):
        # URL form: the authority sits between "//" and the next "/".
        if not rest.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # With an authority present, RFC 3986 (ss 3. and 3.3.) says the
        # path is either empty or begins with '/'.
        path_start = rest.find("/", 2)
        if path_start == -1:
            authority = rest[2:]
        else:
            authority = rest[2:path_start]
    else:
        # Bare authority: whatever splittype() took for a scheme was not one.
        scheme = None
        authority = proxy

    userinfo, hostport = splituser(authority)
    if userinfo is None:
        user = password = None
    else:
        user, password = splitpasswd(userinfo)
    return scheme, user, password, hostport
661
class ProxyHandler(BaseHandler):
    """Route requests through per-scheme proxies.

    For every (scheme, proxy URL) pair in the *proxies* mapping an
    instance attribute ``<scheme>_open`` is installed that forwards to
    proxy_open() with that proxy.
    """

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        """Install one ``<scheme>_open`` hook per entry of *proxies*.

        When *proxies* is None the mapping comes from getproxies().
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, proxy_url in proxies.items():
            # Defaults bind the current loop values; a plain closure would
            # see only the last pair.
            opener = (lambda r, proxy=proxy_url, type=scheme,
                      meth=self.proxy_open: meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, opener)

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*, adding Basic proxy credentials if given.

        Returns None when the proxy speaks the request's own scheme, so
        the remaining handlers carry on; otherwise re-submits the request
        through the parent opener.
        """
        # Must be read before set_proxy() is called on the request.
        orig_type = req.get_type()
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type
        if user and password:
            user_pass = '%s:%s' % (unquote(user), unquote(password))
            creds = base64.b64encode(user_pass).strip()
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        req.set_proxy(unquote(hostport), proxy_type)
        if orig_type != proxy_type:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req)
        # let other handlers take care of it
        return None
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000698
class HTTPPasswordMgr:
    """Map (realm, URI prefix) to (user, password) credentials.

    self.passwd is a dict keyed by realm; each value is a dict whose keys
    are tuples of reduced URIs (see reduce_uri) and whose values are
    (user, passwd) tuples.
    """

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register *user*/*passwd* for *realm* and one or more URIs.

        *uri* could be a single URI or a sequence of URIs.  Each URI is
        stored twice: once reduced with the default port filled in and
        once as given.
        """
        if isinstance(uri, basestring):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) registered for *realm* and *authuri*.

        Returns (None, None) when no stored URI is equal to, or a parent
        of, *authuri*.
        """
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path.

        Returns an (authority, path) tuple.  When *default_port* is true
        and an http/https URI carries no port, the scheme's default port
        is appended to the authority.
        """
        # note HTTP URLs do not have a userinfo component
        parts = urlparse.urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form, i.e. (authority, path)
        tuples as returned by reduce_uri().
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare on path-segment boundaries.  A plain character prefix
        # would treat "/foobar" as lying below "/foo" and hand it the
        # credentials registered for "/foo".
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
Tim Peterse1190062001-01-15 03:34:38 +0000761
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000762
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the credentials stored for realm None."""

    def find_user_password(self, realm, authuri):
        """Look up *realm* first; if no user is found, retry with realm None."""
        found_user, found_password = HTTPPasswordMgr.find_user_password(
            self, realm, authuri)
        if found_user is None:
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return found_user, found_password
771
772
class AbstractBasicAuthHandler:
    """Shared HTTP Basic authentication logic.

    The methods read ``self.auth_header`` (name of the request header that
    carries the credentials) and ``self.parent`` (opener used to re-issue
    the request); neither is defined by this class itself.
    """

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        """Use *password_mgr* for lookups; default to a new HTTPPasswordMgr."""
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry *req* with Basic credentials if the challenge asks for them.

        *authreq* is the name of the challenge header to read from
        *headers*.  *host* may be an authority (without userinfo) or a URL
        with an authority.  Returns the retried response, or None when the
        header is missing, is not a Basic challenge, or no retry is made.
        """
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)
        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, realm = mo.groups()
                if scheme.lower() == 'basic':
                    return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send *req* with the credentials stored for *realm* and *host*.

        Returns None when no password is known or when the same
        credentials were already attached to the request (retrying would
        loop forever).
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is None:
            return None
        raw = "%s:%s" % (user, pw)
        auth = 'Basic %s' % base64.b64encode(raw).strip()
        # get_header() also consults the unredirected headers, which is
        # where the credentials are stored below.
        # NOTE(review): relies on Request.get_header, defined outside this
        # block -- confirm it is available.
        if req.get_header(self.auth_header, None) == auth:
            return None
        # Unredirected, like the digest handler: credentials must not be
        # carried along when the request is redirected elsewhere.
        req.add_unredirected_header(self.auth_header, auth)
        return self.parent.open(req)
813
Thomas Wouters477c8d52006-05-27 19:21:47 +0000814
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses using HTTP Basic authentication."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Handle a 401 by delegating to http_error_auth_reqed().

        The request's full URL is passed as the host argument for the
        credential lookup.
        """
        full_url = req.get_full_url()
        return self.http_error_auth_reqed(
            'www-authenticate', full_url, req, headers)
Moshe Zadka8a18e992001-03-01 08:40:42 +0000823
824
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses using HTTP Basic authentication."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Handle a 407 by delegating to http_error_auth_reqed()."""
        # http_error_auth_reqed requires that there is no userinfo component
        # in authority.  Assume there isn't one, since urllib2 does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        proxy_authority = req.get_host()
        return self.http_error_auth_reqed(
            'proxy-authenticate', proxy_authority, req, headers)
Moshe Zadka8a18e992001-03-01 08:40:42 +0000837
838
def randombytes(n):
    """Return n random bytes.

    Uses os.urandom(), the operating system's randomness source.  If the
    platform offers none (os.urandom raises NotImplementedError), falls
    back to the random module.
    """
    # os.urandom replaces opening "/dev/urandom" by hand: the device was
    # opened in text mode and was not closed if the read failed.
    try:
        return os.urandom(n)
    except NotImplementedError:
        return "".join(chr(random.randrange(0, 256)) for _ in range(n))
852
class AbstractDigestAuthHandler:
    """Shared HTTP Digest authentication logic.

    The methods read ``self.auth_header`` and ``self.parent``, neither of
    which is defined by this class itself.
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" support is shaky

    def __init__(self, passwd=None):
        """Use *passwd* as password manager; default to a new HTTPPasswordMgr."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0

    def reset_retry_count(self):
        """Reset the counter that limits repeated authentication attempts."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry *req* with Digest credentials if the challenge asks for them.

        *auth_header* names the challenge header to read from *headers*.
        Raises HTTPError once more than five attempts have been counted.
        Returns the retried response, or None when there is no Digest
        challenge or no credentials could be produced.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.get_full_url(), 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            # A whitespace-only header value splits into nothing; treat it
            # as "no challenge" instead of failing with IndexError.
            words = authreq.split()
            if words and words[0].lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Re-send *req* answering the Digest challenge header value *auth*.

        Returns the new response, or None when no Authorization value could
        be computed or the identical value is already in req.headers.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(parse_http_list(challenge))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp

    def get_cnonce(self, nonce):
        """Return a 16-character hexadecimal client nonce."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce,
                                            time.ctime(),
                                            randombytes(8))).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest credentials string answering challenge *chal*.

        *chal* is the parsed challenge (a dict of directive -> value).
        Returns None when a required directive is missing, the algorithm
        or qop is not supported, or no user is known for the realm.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in [option.strip() for option in qop.split(',')]:
            # The challenge's qop is a list of options (RFC 2617, 3.2.1);
            # "auth" is the one this handler implements.
            qop = 'auth'
            self.nonce_count += 1
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            # No response digest can be computed for this qop; decline
            # instead of failing later on an unbound local.
            return None

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) functions for *algorithm*.

        Returns (None, None) for an algorithm other than 'MD5' or 'SHA';
        get_authorization() checks for H being None.
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x).hexdigest()
        else:
            # XXX MD5-sess
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        """Placeholder for the entity digest; always returns None."""
        # XXX not implemented yet
        return None
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000983
Moshe Zadka8a18e992001-03-01 08:40:42 +0000984
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        """Answer a 401 via http_error_auth_reqed(), then reset the retry count."""
        netloc = urlparse.urlparse(req.get_full_url())[1]
        response = self.http_error_auth_reqed(
            'www-authenticate', netloc, req, headers)
        self.reset_retry_count()
        return response
Moshe Zadka8a18e992001-03-01 08:40:42 +00001001
1002
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 responses using HTTP Digest authentication."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        """Answer a 407 via http_error_auth_reqed(), then reset the retry count."""
        proxy_host = req.get_host()
        response = self.http_error_auth_reqed(
            'proxy-authenticate', proxy_host, req, headers)
        self.reset_retry_count()
        return response
Tim Peterse1190062001-01-15 03:34:38 +00001014
class AbstractHTTPHandler(BaseHandler):
    """Common request preparation and transport for HTTP-like handlers."""

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Set the debug level handed to each connection in do_open()."""
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in default headers on *request* and return it.

        Adds Content-type and Content-length when the request has data,
        a Host header, and every header from ``self.parent.addheaders``;
        headers already present on the request are left alone.  Raises
        URLError if the request has no host.
        """
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            body = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(body))

        # If the selector is itself a full URL, its host wins for the
        # Host header.
        scheme, remainder = splittype(request.get_selector())
        sel_host, sel_path = splithost(remainder)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host or host)
        for header, value in self.parent.addheaders:
            header = header.capitalize()
            if not request.has_header(header):
                request.add_unredirected_header(header, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a mimetools.Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        conn = http_class(host)  # will parse host:port
        conn.set_debuglevel(self._debuglevel)

        merged = dict(req.headers)
        merged.update(req.unredirected_hdrs)
        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        merged["Connection"] = "close"
        headers = dict(
            (name.title(), val) for name, val in merged.items())
        try:
            conn.request(req.get_method(), req.get_selector(), req.data,
                         headers)
            response = conn.getresponse()
        except socket.error as err:  # XXX what error?
            raise URLError(err)

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.

        response.recv = response.read
        fp = socket._fileobject(response, close=True)

        result = addinfourl(fp, response.msg, req.get_full_url())
        result.code = response.status
        result.msg = response.reason
        return result
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001101
Moshe Zadka8a18e992001-03-01 08:40:42 +00001102
class HTTPHandler(AbstractHTTPHandler):
    """Open http: URLs through httplib.HTTPConnection."""

    # Request pre-processing is shared with the other HTTP-like handlers.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Send *req* over a plain HTTP connection."""
        return self.do_open(httplib.HTTPConnection, req)
Moshe Zadka8a18e992001-03-01 08:40:42 +00001109
# HTTPSHandler is defined only when httplib exposes an 'HTTPS' attribute.
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Open https: URLs through httplib.HTTPSConnection."""

        # Request pre-processing is shared with the other HTTP-like handlers.
        https_request = AbstractHTTPHandler.do_request_

        def https_open(self, req):
            """Send *req* over an HTTPS connection."""
            return self.do_open(httplib.HTTPSConnection, req)
1117
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to requests and collect them from responses."""

    def __init__(self, cookiejar=None):
        """Use *cookiejar*, or a new cookielib.CookieJar when None is given."""
        import cookielib
        if cookiejar is not None:
            self.cookiejar = cookiejar
        else:
            self.cookiejar = cookielib.CookieJar()

    def http_request(self, request):
        """Let the jar add its Cookie header to *request*; return the request."""
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Let the jar extract cookies from *response*; return the response."""
        self.cookiejar.extract_cookies(response, request)
        return response

    # https traffic is processed exactly like http traffic.
    https_request = http_request
    https_response = http_response
Moshe Zadka8a18e992001-03-01 08:40:42 +00001135
class UnknownHandler(BaseHandler):
    """Handler whose unknown_open() rejects the request's URL type."""

    def unknown_open(self, req):
        """Always raise URLError naming the request's URL type."""
        raise URLError('unknown url type: %s' % req.get_type())
1140
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dict.  A value enclosed in double quotes has the quotes
    removed.  An element without '=' raises ValueError.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices instead of indexing so that an empty value ("key=")
        # yields '' rather than raising IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1150
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    Splits *s* on commas and returns the stripped elements.  Elements may
    contain quoted-strings, and a comma inside a quoted-string does not
    split.  A non-quoted element may have quotes in the middle.  Inside a
    quoted-string a backslash is dropped and the character after it is
    kept literally.  Only double-quotes count, not single-quotes.
    """
    items = []
    current = []        # characters of the element being collected
    escaped = False     # previous character was a backslash inside quotes
    in_quotes = False

    for ch in s:
        if escaped:
            current.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                current.append(ch)
        elif ch == ',':
            items.append(''.join(current))
            current = []
        else:
            if ch == '"':
                in_quotes = True
            current.append(ch)

    # append last part
    if current:
        items.append(''.join(current))

    return [item.strip() for item in items]
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001193
class FileHandler(BaseHandler):
    """Open file: URLs from the local filesystem, or hand them to FTP."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Dispatch *req*: '//host...' selectors go to FTP, the rest are local."""
        selector = req.get_selector()
        is_local = selector[:2] != '//' or selector[2:3] == '/'
        if is_local:
            return self.open_local_file(req)
        req.type = 'ftp'
        return self.parent.open(req)

    # names for the localhost
    names = None

    def get_names(self):
        """Return the cached tuple of addresses that count as this host.

        The tuple is computed on first use and stored on FileHandler.
        """
        if FileHandler.names is None:
            try:
                FileHandler.names = (
                    socket.gethostbyname('localhost'),
                    socket.gethostbyname(socket.gethostname()))
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the local file named by *req*.

        Raises URLError when os.stat() fails or when the URL's host is not
        the local host.
        """
        import email.utils
        import mimetypes
        host = req.get_host()
        selector = req.get_selector()
        localfile = url2pathname(selector)
        try:
            stats = os.stat(localfile)
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(selector)[0]
            headers = mimetools.Message(StringIO(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', stats.st_size, modified)))
            if host:
                host, port = splitport(host)
            # 'port' is bound only when host was non-empty; the 'not host'
            # test short-circuits before it is read otherwise.
            if not host or (
                    not port and
                    socket.gethostbyname(host) in self.get_names()):
                return addinfourl(open(localfile, 'rb'),
                                  headers, 'file:' + selector)
        except OSError as msg:
            # urllib2 users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001240
class FTPHandler(BaseHandler):
    """Open ftp: URLs through an ftpwrapper connection."""

    def ftp_open(self, req):
        """Retrieve the file or directory named by *req* over FTP.

        Returns an addinfourl object.  Raises IOError('ftp error', ...)
        when the request has no host or ftplib reports an error, and
        URLError when the host name cannot be resolved.
        """
        import ftplib
        import mimetypes
        host = req.get_host()
        if not host:
            # Call form instead of the statement form "raise E, V"; it
            # builds the same exception.
            raise IOError('ftp error', 'no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)
        path, attrs = splitattr(req.get_selector())
        # A real list, because it is sliced and indexed just below.
        dirs = [unquote(part) for part in path.split('/')]
        dirs, filename = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs)
            # Default transfer type: 'I' when a file name is present,
            # otherwise 'D'; a ";type=" attribute overrides it.
            transfer_type = 'I' if filename else 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    transfer_type = value.upper()
            fp, retrlen = fw.retrfile(filename, transfer_type)
            headers = ""
            mtype = mimetypes.guess_type(req.get_full_url())[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            sf = StringIO(headers)
            headers = mimetools.Message(sf)
            return addinfourl(fp, headers, req.get_full_url())
        except ftplib.all_errors as msg:
            # Call form replaces "raise E, V, traceback".  On Python 3 the
            # ftplib error stays attached as the new exception's context.
            raise IOError('ftp error', msg)

    def connect_ftp(self, user, passwd, host, port, dirs):
        """Create and return an ftpwrapper for the given connection details."""
        fw = ftpwrapper(user, passwd, host, port, dirs)
##        fw.ftp.set_debuglevel(1)
        return fw
1299
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps ftpwrapper connections around for reuse.

    Connections are cached under the key (user, host, port, joined dirs).
    Every use of a key pushes its expiry to ``delay`` seconds in the
    future; expired entries are closed and dropped by check_cache().
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}       # key -> ftpwrapper
        self.timeout = {}     # key -> absolute expiry time (time.time() based)
        self.soonest = 0      # smallest value in self.timeout, 0 when empty
        self.delay = 60       # seconds added to "now" on each use of a key
        self.max_conns = 16   # cache size that triggers an eviction

    def setTimeout(self, t):
        """Set the delay, in seconds, used for subsequent expiry times."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which check_cache() evicts an entry."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs):
        """Return the cached ftpwrapper for this location, creating it if needed.

        The entry's expiry time is refreshed on every call, and the cache
        is pruned via check_cache() before the wrapper is returned.
        """
        # NOTE(review): passwd is not part of the key -- confirm that is
        # intended.
        key = user, host, port, '/'.join(dirs)
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Drop expired connections, then enforce the size limit.

        Safe to call on an empty cache: ``soonest`` falls back to 0 instead
        of min() raising ValueError on an empty sequence.
        """
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            # Iterate over a snapshot because entries are deleted in the loop.
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        self.soonest = min(self.timeout.values()) if self.timeout else 0

        # then check the size
        # NOTE(review): the '==' test never fires if max_conns is lowered
        # below the current cache size, and the evicted wrapper is dropped
        # without close() -- confirm both are intended before changing.
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values()) if self.timeout else 0
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001345
class GopherHandler(BaseHandler):
    """Protocol handler that retrieves ``gopher:`` URLs via gopherlib."""

    def gopher_open(self, req):
        """Open the gopher resource described by *req*.

        req must provide get_host(), get_selector() and get_full_url().
        Raises GopherError when the request carries no host.  Returns an
        addinfourl object wrapping the gopherlib stream, with empty headers.
        """
        # XXX can raise socket.error
        import gopherlib # this raises DeprecationWarning in 2.5

        raw_host = req.get_host()
        if not raw_host:
            raise GopherError('no host given')
        hostname = unquote(raw_host)

        # The gopher type prefix is split off the selector but not used.
        _gtype, remainder = splitgophertype(req.get_selector())
        remainder, raw_query = splitquery(remainder)
        path = unquote(remainder)

        if not raw_query:
            stream = gopherlib.send_selector(path, hostname)
        else:
            search = unquote(raw_query)
            stream = gopherlib.send_query(path, search, hostname)
        return addinfourl(stream, noheaders(), req.get_full_url())