blob: 9277b1d1cdc89cb9d99892b6dc783b90b5c7ce7e [file] [log] [blame]
"""An extensible library for opening URLs using a variety of protocols

The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below). It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.

The OpenerDirector manages a collection of Handler objects that do
all the actual work. Each Handler implements a particular protocol or
option. The OpenerDirector is a composite object that invokes the
Handlers needed to open the requested URL. For example, the
HTTPHandler performs HTTP GET and POST requests and deals with
non-error returns. The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
deals with digest authentication.

urlopen(url, data=None) -- Basic usage is the same as original
urllib. Pass the url and optionally data to post to an HTTP URL, and
get a file-like object back. One difference is that you can also pass
a Request instance instead of URL. Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.

build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers. Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate. If one of the arguments is a subclass of the default
handler, the argument will be installed instead of the default.

install_opener -- Installs a new opener as the default opener.

objects of interest:

OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
the Handler classes, while dealing with requests and responses.

Request -- An object that encapsulates the state of a request. The
state can be as simple as the URL. It can also include extra HTTP
headers, e.g. a User-Agent.

BaseHandler --

exceptions:
URLError -- A subclass of IOError, individual protocols have their own
specific subclass.

HTTPError -- Also a valid HTTP response, so you can treat an HTTP error
as an exceptional event or valid response.

internals:
BaseHandler and parent
_call_chain conventions

Example usage:

import urllib2

# set up authentication info
authinfo = urllib2.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
                      uri='https://mahler:8092/site-updates.py',
                      user='klem',
                      passwd='geheim$parole')

proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)

# install it
urllib2.install_opener(opener)

f = urllib2.urlopen('http://www.python.org/')


"""
77
# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails for some reason, how should the error be signalled? The
# client needs to know the HTTP error code. But if the handler knows
# that the problem was, e.g., that it didn't know the hash algo that
# was requested in the challenge, it would be good to pass that
# information along to the client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation

# Possible extensions:
# complex proxies XXX not sure what exactly was meant by this
# abstract factory for opener
91
Jeremy Hyltonfcefd0d2003-10-21 18:07:07 +000092import base64
Georg Brandlbffb0bc2006-04-30 08:57:35 +000093import hashlib
Georg Brandl9d6da3e2006-05-17 15:17:00 +000094import httplib
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +000095import mimetools
Jeremy Hyltonfcefd0d2003-10-21 18:07:07 +000096import os
97import posixpath
98import random
99import re
Jeremy Hyltonfcefd0d2003-10-21 18:07:07 +0000100import socket
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000101import sys
102import time
Jeremy Hyltonfcefd0d2003-10-21 18:07:07 +0000103import urlparse
Jeremy Hyltonc1be59f2003-12-14 05:27:34 +0000104import bisect
Senthil Kumaranb0d85fd2012-05-15 23:59:19 +0800105import warnings
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000106
107try:
108 from cStringIO import StringIO
109except ImportError:
110 from StringIO import StringIO
111
Benjamin Petersonfcfb18e2014-11-23 11:42:45 -0600112# check for SSL
113try:
114 import ssl
115except ImportError:
116 _have_ssl = False
117else:
118 _have_ssl = True
119
Georg Brandl7fff58c2006-04-02 21:13:13 +0000120from urllib import (unwrap, unquote, splittype, splithost, quote,
Senthil Kumaran01fe5fa2012-07-07 17:37:53 -0700121 addinfourl, splitport, splittag, toBytes,
Brett Cannon88f801d2008-08-18 00:46:22 +0000122 splitattr, ftpwrapper, splituser, splitpasswd, splitvalue)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000123
Jeremy Hyltonfcefd0d2003-10-21 18:07:07 +0000124# support for FileHandler, proxies via environment variables
Senthil Kumaran27468662009-10-11 02:00:07 +0000125from urllib import localhost, url2pathname, getproxies, proxy_bypass
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000126
# used in User-Agent header sent
# Built from sys.version_info rather than by slicing sys.version, so that a
# two-digit minor version (e.g. "3.10") is not truncated to "3.1".
__version__ = '%d.%d' % sys.version_info[:2]

# Module-wide default opener; urlopen() creates it on first use and
# install_opener() replaces it.
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            cafile=None, capath=None, cadefault=False, context=None):
    """Open *url* and return whatever the selected opener's open() returns.

    url may be anything OpenerDirector.open() accepts; data and timeout
    are passed through to it unchanged.

    If any of cafile, capath or cadefault is true, an SSL context is
    created from cafile/capath and a one-off opener with an HTTPSHandler
    using that context is built for this call.  cadefault only takes part
    in that truthiness check; it is not otherwise used here.  A true
    context argument likewise gets a one-off opener.  Otherwise the
    module-wide default opener is used, and created on first use.

    Raises ValueError if context is combined with cafile/capath/cadefault,
    or if those are given but the ssl module could not be imported.
    """
    global _opener
    wants_ca_settings = cafile or capath or cadefault
    if wants_ca_settings:
        if context is not None:
            raise ValueError(
                "You can't pass both context and any of cafile, capath, and "
                "cadefault"
            )
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH,
                                             cafile=cafile,
                                             capath=capath)

    if wants_ca_settings or context:
        # Per-call opener; deliberately not stored as the module default.
        director = build_opener(HTTPSHandler(context=context))
    else:
        if _opener is None:
            _opener = build_opener()
        director = _opener
    return director.open(url, data, timeout)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000155
def install_opener(opener):
    """Make *opener* the module-wide default opener used by urlopen()."""
    global _opener
    _opener = opener
159
160# do these error classes make sense?
Tim Peterse1190062001-01-15 03:34:38 +0000161# make sure all of the IOError stuff is overridden. we just want to be
Fred Drakea87a5212002-08-13 13:59:55 +0000162# subtypes.
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000163
class URLError(IOError):
    """Error carrying a single ``reason`` describing why a URL open failed.

    reason is stored unchanged on the instance and is also exposed as the
    sole element of ``args``.
    """
    # URLError is a sub-type of IOError, but it doesn't share any of
    # the implementation.  need to override __init__ and __str__.
    # It sets self.args for compatibility with other EnvironmentError
    # subclasses, but args doesn't have the typical format with errno in
    # slot 0 and strerror in slot 1.  This may be better than nothing.
    def __init__(self, reason):
        self.args = reason,
        self.reason = reason

    def __str__(self):
        # Wrap reason in a 1-tuple: a reason that is itself a tuple would
        # otherwise be unpacked as several format arguments and make the
        # %-formatting raise TypeError.
        return '<urlopen error %s>' % (self.reason,)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000176
class HTTPError(URLError, addinfourl):
    """An HTTP error that can also be used like a normal response object.

    Stores the status code, message, headers and body file object; when a
    body file object is supplied the addinfourl base is initialised too.
    """
    __super_init = addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        self.code, self.msg = code, msg
        self.hdrs, self.fp = hdrs, fp
        self.filename = url
        # The addinfourl classes depend on fp being a valid file
        # object.  In some cases, the HTTPError may not have a valid
        # file object.  If this happens, the simplest workaround is to
        # not initialize the base classes.
        if fp is None:
            return
        self.__super_init(fp, hdrs, url, code)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)

    # since URLError specifies a .reason attribute, HTTPError should also
    # provide this attribute. See issue13211 for discussion.
    @property
    def reason(self):
        return self.msg

    def info(self):
        # Return the headers given to the constructor.
        return self.hdrs
205
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is the network-location part of the request's full URL; if
    that is empty, the request's "Host" header is used instead (or "" when
    there is none).  A trailing ":<digits>" port is stripped.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urlparse.urlparse(request.get_full_url())[1]
    if not netloc:
        netloc = request.get_header("Host", "")

    # remove port, if present
    return _cut_port_re.sub("", netloc, 1).lower()
Moshe Zadka8a18e992001-03-01 08:40:42 +0000223
class Request:
    """The state of one URL request: URL, optional body data and headers.

    The URL is split lazily: get_type(), get_host() and get_selector()
    compute the scheme, the host and the remainder on first use and cache
    them on the instance.
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        """Store the request state.

        url may be wrapped as '<URL:...>'; any '#fragment' is split off
        and only re-attached by get_full_url().  headers is only iterated,
        never mutated, so the shared default dict is harmless here.  When
        origin_req_host is None it is derived with request_host(self).
        """
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        self.__original, self.__fragment = splittag(self.__original)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self._tunnel_host = None
        self.data = data
        self.headers = {}
        for key, value in headers.items():
            self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        # Only the name-mangled private '__r_<name>' attributes are
        # handled: the matching get_<name>() is called to populate the
        # attribute, then the lookup is retried.
        if attr[:12] == '_Request__r_':
            name = attr[12:]
            if hasattr(Request, 'get_' + name):
                getattr(self, 'get_' + name)()
                return getattr(self, attr)
        raise AttributeError(attr)

    def get_method(self):
        """Return "POST" if the request carries data, otherwise "GET"."""
        if self.has_data():
            return "POST"
        else:
            return "GET"

    # XXX these helper methods are lame

    def add_data(self, data):
        """Replace the request's body data."""
        self.data = data

    def has_data(self):
        """Return True if body data is set (i.e. data is not None)."""
        return self.data is not None

    def get_data(self):
        """Return the body data (may be None)."""
        return self.data

    def get_full_url(self):
        """Return the URL, with its '#fragment' re-attached if it had one."""
        if self.__fragment:
            return '%s#%s' % (self.__original, self.__fragment)
        else:
            return self.__original

    def get_type(self):
        """Return the URL scheme, computing and caching it on first use.

        Raises ValueError if the URL has no scheme.
        """
        if self.type is None:
            self.type, self.__r_type = splittype(self.__original)
            if self.type is None:
                raise ValueError("unknown url type: %s" % self.__original)
        return self.type

    def get_host(self):
        """Return the unquoted host part, computing it on first use."""
        if self.host is None:
            self.host, self.__r_host = splithost(self.__r_type)
            if self.host:
                self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        """Return what follows the host (or the whole URL after set_proxy)."""
        return self.__r_host

    def set_proxy(self, host, type):
        """Route the request through the proxy at *host*.

        For an 'https' request with no tunnel host recorded yet, the
        current host is remembered in _tunnel_host and the type is left
        alone; otherwise the type becomes *type* and the selector becomes
        the original URL.  In both cases the host becomes *host*.
        """
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.__r_host = self.__original

        self.host = host

    def has_proxy(self):
        """Return True if the selector equals the original URL."""
        return self.__r_host == self.__original

    def get_origin_req_host(self):
        return self.origin_req_host

    def is_unverifiable(self):
        return self.unverifiable

    def add_header(self, key, val):
        # useful for something like authentication
        # Keys are stored via str.capitalize(), e.g. 'X-token'.
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """Return True if header_name is a key in either header dict.

        The name is compared as given; it is not capitalized here.
        """
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        """Look header_name up in headers, then in unredirected_hdrs."""
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return items of both header dicts; regular headers win clashes."""
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return hdrs.items()
Jeremy Hyltonc1be59f2003-12-14 05:27:34 +0000337
class OpenerDirector:
    """Dispatch requests to registered handlers.

    Handlers are indexed by the names of their methods: <proto>_open,
    <proto>_request, <proto>_response and <proto>_error_<kind>.
    """

    def __init__(self):
        self.addheaders = [('User-agent', "Python-urllib/%s" % __version__)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler* under every method name that fits the scheme.

        Raises TypeError if handler has no add_parent attribute.  The
        handler is added to self.handlers and told its parent only if at
        least one of its methods was registered.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        # Conditions other than "error..." map straight to one table.
        plain_tables = {
            "open": self.handle_open,
            "response": self.process_response,
            "request": self.process_request,
        }
        registered = False
        for name in dir(handler):
            if name in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            sep = name.find("_")
            protocol = name[:sep]
            condition = name[sep + 1:]

            if condition.startswith("error"):
                # Everything after the underscore that follows "error" is
                # the kind; purely numeric kinds become ints.
                kind = name[condition.find("_") + sep + 2:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                table = self.handle_error.setdefault(protocol, {})
            elif condition in plain_tables:
                kind = protocol
                table = plain_tables[condition]
            else:
                continue

            # Keeps each list ordered by the handlers' own comparison;
            # inserting into an empty list performs no comparison.
            bisect.insort(table.setdefault(kind, []), handler)
            registered = True

        if registered:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        for handler in chain.get(kind, ()):
            outcome = getattr(handler, meth_name)(*args)
            if outcome is not None:
                return outcome

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a Request) and return the response.

        The request passes through the <proto>_request processors, then
        _open(), then the <proto>_response processors.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, basestring):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        req.timeout = timeout
        scheme = req.get_type()

        # pre-process request
        for processor in self.process_request.get(scheme, []):
            req = getattr(processor, scheme + "_request")(req)

        response = self._open(req, data)

        # post-process response
        for processor in self.process_response.get(scheme, []):
            response = getattr(processor, scheme + "_response")(req, response)

        return response

    def _open(self, req, data=None):
        # Try the 'default' openers, then the scheme's own, then 'unknown';
        # the first truthy result wins.
        found = self._call_chain(self.handle_open, 'default',
                                 'default_open', req)
        if found:
            return found

        scheme = req.get_type()
        found = self._call_chain(self.handle_open, scheme,
                                 scheme + '_open', req)
        if found:
            return found

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the handlers registered for it.

        For 'http'/'https' the handlers for the specific code (args[2])
        are tried first, then the 'default' ones; for other protocols only
        <proto>_error handlers are tried.
        """
        is_http = proto in ('http', 'https')
        if is_http:
            # XXX http[s] protocols are special-cased
            table = self.handle_error['http']  # https is not different than http
            code = args[2]  # YUCK!
            kind = code
            meth_name = 'http_error_%s' % code
        else:
            table = self.handle_error
            kind = proto
            meth_name = proto + '_error'

        handled = self._call_chain(table, kind, meth_name, *args)
        if handled:
            return handled

        if is_http:
            return self._call_chain(table, 'default',
                                    'http_error_default', *args)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000476
Gustavo Niemeyer9556fba2003-06-07 17:53:08 +0000477# XXX probably also want an abstract factory that knows when it makes
478# sense to skip a superclass in favor of a subclass and when it might
479# make sense to include both
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000480
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    Each argument may be a handler instance or a handler class; classes
    are instantiated with no arguments.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable, HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    import types

    def isclass(obj):
        return isinstance(obj, (types.ClassType, type))

    def replaces(candidate, klass):
        # True when candidate (a class or an instance) derives from klass.
        if isclass(candidate):
            return issubclass(candidate, klass)
        return isinstance(candidate, klass)

    defaults = [ProxyHandler, UnknownHandler, HTTPHandler,
                HTTPDefaultErrorHandler, HTTPRedirectHandler,
                FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(httplib, 'HTTPS'):
        defaults.append(HTTPSHandler)

    overridden = set(klass
                     for klass in defaults
                     for candidate in handlers
                     if replaces(candidate, klass))

    opener = OpenerDirector()
    # Defaults first (minus the overridden ones), then the caller's handlers.
    for klass in defaults:
        if klass not in overridden:
            opener.add_handler(klass())

    for supplied in handlers:
        opener.add_handler(supplied() if isclass(supplied) else supplied)
    return opener
519
class BaseHandler:
    """Base class for handlers; provides ordering via handler_order."""
    # Handlers compare by this value, so lower values sort first.
    handler_order = 500

    def add_parent(self, parent):
        # Remember the object this handler was registered with.
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        if hasattr(other, "handler_order"):
            return self.handler_order < other.handler_order
        # Try to preserve the old behavior of having custom classes
        # inserted after default ones (works only for custom user
        # classes which are not aware of handler_order).
        return True
Tim Petersf545baa2003-06-15 23:26:30 +0000537
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000538
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return 2xx responses unchanged; route the rest to parent.error()."""
        status = response.code
        reason = response.msg
        headers = response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= status < 300:
            return response

        return self.parent.error(
            'http', request, response, status, reason, headers)

    https_response = http_response
555
class HTTPDefaultErrorHandler(BaseHandler):
    """Fallback HTTP error handler: always raises HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        failed_url = req.get_full_url()
        raise HTTPError(failed_url, code, msg, hdrs, fp)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000559
560class HTTPRedirectHandler(BaseHandler):
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000561 # maximum number of redirections to any single URL
562 # this is needed because of the state that cookies introduce
563 max_repeats = 4
564 # maximum total number of redirections (regardless of URL) before
565 # assuming we're in a loop
Jeremy Hyltonc1be59f2003-12-14 05:27:34 +0000566 max_redirections = 10
567
Jeremy Hylton03892952003-05-05 04:09:13 +0000568 def redirect_request(self, req, fp, code, msg, headers, newurl):
Raymond Hettinger024aaa12003-04-24 15:32:12 +0000569 """Return a Request or None in response to a redirect.
570
Jeremy Hyltonaefae552003-07-10 13:30:12 +0000571 This is called by the http_error_30x methods when a
572 redirection response is received. If a redirection should
573 take place, return a new Request to allow http_error_30x to
574 perform the redirect. Otherwise, raise HTTPError if no-one
575 else should try to handle this url. Return None if you can't
576 but another Handler might.
Raymond Hettinger024aaa12003-04-24 15:32:12 +0000577 """
Jeremy Hylton828023b2003-05-04 23:44:49 +0000578 m = req.get_method()
579 if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
Martin v. Löwis162f0812003-07-12 07:33:32 +0000580 or code in (301, 302, 303) and m == "POST"):
581 # Strictly (according to RFC 2616), 301 or 302 in response
582 # to a POST MUST NOT cause a redirection without confirmation
Jeremy Hylton828023b2003-05-04 23:44:49 +0000583 # from the user (of urllib2, in this case). In practice,
584 # essentially all clients do redirect in this case, so we
585 # do the same.
Georg Brandlddb84d72006-03-18 11:35:18 +0000586 # be conciliant with URIs containing a space
587 newurl = newurl.replace(' ', '%20')
Facundo Batista86371d62008-02-07 19:06:52 +0000588 newheaders = dict((k,v) for k,v in req.headers.items()
589 if k.lower() not in ("content-length", "content-type")
590 )
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000591 return Request(newurl,
Facundo Batista86371d62008-02-07 19:06:52 +0000592 headers=newheaders,
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000593 origin_req_host=req.get_origin_req_host(),
594 unverifiable=True)
Raymond Hettinger024aaa12003-04-24 15:32:12 +0000595 else:
Martin v. Löwise3b67bc2003-06-14 05:51:25 +0000596 raise HTTPError(req.get_full_url(), code, msg, headers, fp)
Raymond Hettinger024aaa12003-04-24 15:32:12 +0000597
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000598 # Implementation note: To avoid the server sending us into an
599 # infinite loop, the request object needs to track what URLs we
600 # have already seen. Do this by adding a handler-specific
601 # attribute to the Request object.
def http_error_302(self, req, fp, code, msg, headers):
    """Follow a 30x redirect, guarding against redirect loops.

    The target is taken from the first ``Location`` header (or, failing
    that, the first ``URI`` header), resolved against the URL of *req*,
    turned into a new request by ``self.redirect_request`` and opened
    through ``self.parent``.

    Returns None when the response names no target or when
    ``redirect_request`` returns None; otherwise returns the result of
    ``self.parent.open``.

    Raises HTTPError when the target is not an http, https or ftp URL,
    or when the ``max_repeats`` / ``max_redirections`` limits are hit.
    """
    # Some servers (incorrectly) return multiple Location headers
    # (so probably same goes for URI).  Use first header.
    if 'location' in headers:
        newurl = headers.getheaders('location')[0]
    elif 'uri' in headers:
        newurl = headers.getheaders('uri')[0]
    else:
        return

    # Fix a possibly malformed absolute URL ("http://host" -> "http://host/").
    # Only do so when a network location is present: a relative reference
    # such as "?query" legitimately has an empty path, and forcing it to
    # "/" would make urljoin() below resolve it against the site root
    # instead of against the path of the current request.
    urlparts = urlparse.urlparse(newurl)
    if not urlparts.path and urlparts.netloc:
        urlparts = list(urlparts)
        urlparts[2] = "/"
    newurl = urlparse.urlunparse(urlparts)

    newurl = urlparse.urljoin(req.get_full_url(), newurl)

    # For security reasons we do not allow redirects to protocols
    # other than HTTP, HTTPS or FTP.
    newurl_lower = newurl.lower()
    if not (newurl_lower.startswith('http://') or
            newurl_lower.startswith('https://') or
            newurl_lower.startswith('ftp://')):
        raise HTTPError(newurl, code,
                        msg + " - Redirection to url '%s' is not allowed" %
                        newurl,
                        headers, fp)

    # XXX Probably want to forget about the state of the current
    # request, although that might interact poorly with other
    # handlers that also use handler-specific request attributes
    new = self.redirect_request(req, fp, code, msg, headers, newurl)
    if new is None:
        return

    # loop detection
    # .redirect_dict has a key url if url was previously visited.
    # The same dict object is shared by the old and the new request so
    # the visit counts accumulate along the whole redirect chain.
    if hasattr(req, 'redirect_dict'):
        visited = new.redirect_dict = req.redirect_dict
        if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
            raise HTTPError(req.get_full_url(), code,
                            self.inf_msg + msg, headers, fp)
    else:
        visited = new.redirect_dict = req.redirect_dict = {}
    visited[newurl] = visited.get(newurl, 0) + 1

    # Don't close the fp until we are sure that we won't use it
    # with HTTPError.
    fp.read()
    fp.close()

    return self.parent.open(new, timeout=req.timeout)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000657
# 301, 303 and 307 responses share the 302 implementation.
http_error_301 = http_error_302
http_error_303 = http_error_302
http_error_307 = http_error_302

# Message prefix used when redirect loop detection gives up.
inf_msg = ("The HTTP server returned a redirect error that would "
           "lead to an infinite loop.\n"
           "The last 30x error message was:\n")
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000663
Georg Brandl720096a2006-04-02 20:45:34 +0000664
665def _parse_proxy(proxy):
666 """Return (scheme, user, password, host/port) given a URL or an authority.
667
668 If a URL is supplied, it must have an authority (host:port) component.
669 According to RFC 3986, having an authority component means the URL must
670 have two slashes after the scheme:
671
672 >>> _parse_proxy('file:/ftp.example.com/')
673 Traceback (most recent call last):
674 ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
675
676 The first three items of the returned tuple may be None.
677
678 Examples of authority parsing:
679
680 >>> _parse_proxy('proxy.example.com')
681 (None, None, None, 'proxy.example.com')
682 >>> _parse_proxy('proxy.example.com:3128')
683 (None, None, None, 'proxy.example.com:3128')
684
685 The authority component may optionally include userinfo (assumed to be
686 username:password):
687
688 >>> _parse_proxy('joe:password@proxy.example.com')
689 (None, 'joe', 'password', 'proxy.example.com')
690 >>> _parse_proxy('joe:password@proxy.example.com:3128')
691 (None, 'joe', 'password', 'proxy.example.com:3128')
692
693 Same examples, but with URLs instead:
694
695 >>> _parse_proxy('http://proxy.example.com/')
696 ('http', None, None, 'proxy.example.com')
697 >>> _parse_proxy('http://proxy.example.com:3128/')
698 ('http', None, None, 'proxy.example.com:3128')
699 >>> _parse_proxy('http://joe:password@proxy.example.com/')
700 ('http', 'joe', 'password', 'proxy.example.com')
701 >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
702 ('http', 'joe', 'password', 'proxy.example.com:3128')
703
704 Everything after the authority is ignored:
705
706 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
707 ('ftp', 'joe', 'password', 'proxy.example.com')
708
709 Test for no trailing '/' case:
710
711 >>> _parse_proxy('http://joe:password@proxy.example.com')
712 ('http', 'joe', 'password', 'proxy.example.com')
713
714 """
Georg Brandl720096a2006-04-02 20:45:34 +0000715 scheme, r_scheme = splittype(proxy)
716 if not r_scheme.startswith("/"):
717 # authority
718 scheme = None
719 authority = proxy
720 else:
721 # URL
722 if not r_scheme.startswith("//"):
723 raise ValueError("proxy URL with no authority: %r" % proxy)
724 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
725 # and 3.3.), path is empty or starts with '/'
726 end = r_scheme.find("/", 2)
727 if end == -1:
728 end = None
729 authority = r_scheme[2:end]
730 userinfo, hostport = splituser(authority)
731 if userinfo is not None:
732 user, password = splitpasswd(userinfo)
733 else:
734 user = password = None
735 return scheme, user, password, hostport
736
class ProxyHandler(BaseHandler):
    """Route requests through the proxies configured per URL scheme.

    For every ``scheme -> proxy URL`` entry of the mapping, a
    ``<scheme>_open`` method is installed on the instance that forwards to
    ``proxy_open``.
    """

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies

        proxy_open = self.proxy_open

        def make_opener(proxy_url, scheme):
            # Bind the per-scheme values now; a plain closure over the
            # loop variables would see only the last iteration.
            def scheme_open(req):
                return proxy_open(req, proxy_url, scheme)
            return scheme_open

        for scheme, url in proxies.items():
            setattr(self, '%s_open' % scheme, make_opener(url, scheme))

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*.

        Returns None to let the remaining handlers process the (now
        proxied) request, or the response of a fresh ``parent.open`` when
        the proxy speaks a different scheme than the original request.
        """
        orig_type = req.get_type()
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            # A bare authority inherits the scheme of the request.
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            userinfo = '%s:%s' % (unquote(user), unquote(password))
            encoded = base64.b64encode(userinfo).strip()
            req.add_header('Proxy-authorization', 'Basic ' + encoded)
        req.set_proxy(unquote(hostport), proxy_type)

        if orig_type in (proxy_type, 'https'):
            # let other handlers take care of it
            return None

        # need to start over, because the other handlers don't
        # grok the proxy's URL type
        # e.g. if we have a constructor arg proxies like so:
        # {'http': 'ftp://proxy.example.com'}, we may end up turning
        # a request for http://acme.example.com/a into one for
        # ftp://proxy.example.com/a
        return self.parent.open(req, timeout=req.timeout)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000779
class HTTPPasswordMgr:
    """In-memory store of (user, password) pairs keyed by realm and URI."""

    def __init__(self):
        # {realm: {tuple of reduced URIs: (user, passwd)}}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register *user*/*passwd* for *realm* at *uri*.

        *uri* is a single URI (or bare authority) or a sequence of them.
        """
        # uri could be a single URI or a sequence
        if isinstance(uri, basestring):
            uri = [uri]
        domains = self.passwd.setdefault(realm, {})
        # Store the entry twice: once with the scheme's default port made
        # explicit and once as given, so lookups match either spelling.
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            domains[reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) stored for *realm* that covers
        *authuri*, or (None, None) when there is none."""
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlparse.urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form, i.e. (authority, path)
        pairs as returned by reduce_uri().
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare on a path-segment boundary.  A plain character-wise
        # common prefix would treat "/foobar" as being below "/foo" and
        # hand the credentials for one resource to an unrelated one.
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
Tim Peterse1190062001-01-15 03:34:38 +0000842
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +0000843
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the entries stored under the
    realm None when the requested realm has no match."""

    def find_user_password(self, realm, authuri):
        lookup = HTTPPasswordMgr.find_user_password
        user, password = lookup(self, realm, authuri)
        if user is None:
            # Nothing for this realm: try the default (None) realm.
            return lookup(self, None, authuri)
        return user, password
852
853
class AbstractBasicAuthHandler:
    """Mixin implementing the retry-with-credentials step of Basic auth.

    The retry uses ``self.auth_header`` (name of the request header to
    set) and ``self.parent`` (the opener used to re-issue the request);
    neither is defined by this class.
    """

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    #
    # The leading group skips the comma-terminated challenges that precede
    # the captured one.  It is spelled "[^,\n]*," instead of ".*," so that
    # there is exactly one way to split the skipped text into segments;
    # with ".*," a header made of many commas (which the server controls)
    # sends the regex engine into exponential backtracking.
    rx = re.compile('(?:[^,\n]*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\']?)([^"\']*)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        # Convenience alias so credentials can be added on the handler.
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry *req* with Basic credentials if the challenge asks for them.

        *authreq* is the name of the challenge header to read from
        *headers*.  Returns the response of the retried request, or None
        when there is no usable Basic challenge or no matching password.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, quote, realm = mo.groups()
                if quote not in ['"', "'"]:
                    warnings.warn("Basic Auth Realm was unquoted",
                                  UserWarning, 2)
                if scheme.lower() == 'basic':
                    return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-issue *req* with the credentials stored for *realm*/*host*.

        Returns None when no password is known, or when the very same
        credentials were already sent with *req* (they were rejected, so
        sending them again is pointless).
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = 'Basic %s' % base64.b64encode(raw).strip()
            if req.get_header(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None
902
Georg Brandlfa42bd72006-04-30 07:06:11 +0000903
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses with Basic credentials for the origin server."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # The full request URL is what the password lookup is keyed on.
        return self.http_error_auth_reqed(
            'www-authenticate', req.get_full_url(), req, headers)
Moshe Zadka8a18e992001-03-01 08:40:42 +0000913
914
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses with Basic credentials for the proxy."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib2 does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        return self.http_error_auth_reqed(
            'proxy-authenticate', req.get_host(), req, headers)
Moshe Zadka8a18e992001-03-01 08:40:42 +0000928
929
def randombytes(n):
    """Return n random bytes.

    The bytes come from os.urandom(), i.e. the operating system's
    randomness source.  This replaces reading /dev/urandom by hand (in
    text mode, without closing the file if the read failed) and the
    fallback to the ``random`` module, which is not suitable for
    security-related values.
    """
    return os.urandom(n)
943
class AbstractDigestAuthHandler:
    """Mixin implementing the client side of HTTP Digest authentication.

    The retry uses ``self.auth_header`` (name of the request header to
    set) and ``self.parent`` (the opener used to re-issue the request);
    neither is defined by this class.
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0        # challenges answered since the last reset
        self.nonce_count = 0    # requests sent with the current nonce
        self.last_nonce = None

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry *req* with Digest credentials if the challenge asks for them.

        *auth_header* names the challenge header to read from *headers*.
        Returns the response of the retried request, or None when the
        challenge is missing, is not a Digest challenge, or cannot be
        answered.  Raises HTTPError after more than five retries.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.get_full_url(), 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Answer the challenge *auth* and re-issue *req*.

        Returns None (implicitly) when no Authorization value could be
        computed, or explicitly when the identical value is already
        present in ``req.headers``.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(parse_http_list(challenge))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce,
                                            time.ctime(),
                                            randombytes(8))).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest credentials for *req* from the parsed challenge.

        *chal* is the mapping of challenge parameters.  Returns the header
        value without the leading "Digest ", or None when the challenge
        lacks realm/nonce, names an unsupported algorithm, or no user is
        known for the realm.  Raises URLError for a qop other than 'auth'.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        if qop == 'auth':
            # The nonce count restarts whenever the server issues a new
            # nonce and increments while the same nonce is reused.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce

            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) functions for *algorithm*.

        H hashes a string to a hex digest and KD(secret, data) is
        H("secret:data").  For an algorithm other than MD5 or SHA the
        result is (None, None); get_authorization() checks for that
        instead of this method failing on an unbound local.
        """
        # algorithm should be case-insensitive according to RFC2617
        algorithm = algorithm.upper()
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x).hexdigest()
        else:
            # XXX MD5-sess
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001082
Moshe Zadka8a18e992001-03-01 08:40:42 +00001083
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    handler_order = 490  # before Basic auth
    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        netloc = urlparse.urlparse(req.get_full_url())[1]
        response = self.http_error_auth_reqed(
            'www-authenticate', netloc, req, headers)
        # Reached only when the retry did not raise: start the next
        # challenge with a fresh retry budget.
        self.reset_retry_count()
        return response
Moshe Zadka8a18e992001-03-01 08:40:42 +00001100
1101
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 responses from a proxy with Digest credentials."""

    handler_order = 490  # before Basic auth
    auth_header = 'Proxy-Authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        response = self.http_error_auth_reqed(
            'proxy-authenticate', req.get_host(), req, headers)
        # Reached only when the retry did not raise.
        self.reset_retry_count()
        return response
Tim Peterse1190062001-01-15 03:34:38 +00001113
class AbstractHTTPHandler(BaseHandler):
    """Shared request preparation and connection logic for HTTP(S)."""

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in the default headers of *request* and return it.

        Adds Content-type/Content-length for requests carrying data, a
        Host header, and the opener-wide ``addheaders``; headers already
        present on the request are left alone.  Raises URLError when the
        request has no host.
        """
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        # When going through a proxy, get_host() is the proxy; the Host
        # header must name the host found in the selector instead.
        sel_host = host
        if request.has_proxy():
            scheme, sel = splittype(request.get_selector())
            sel_host, sel_path = splithost(sel)

        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a mimetools.Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code

        Raises URLError when the request has no host or when sending it
        fails with socket.error.  The connection is closed before any
        exception from sending the request or reading the response
        leaves this method.
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers take precedence over regular ones.
        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict(
            (name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
        except socket.error as err:  # XXX what error?
            h.close()
            raise URLError(err)

        try:
            try:
                r = h.getresponse(buffering=True)
            except TypeError:  # buffering kw not supported
                r = h.getresponse()
        except BaseException:
            # Reading the response failed (bad status line, timeout, ...):
            # release the socket instead of leaking it, then let the
            # original exception propagate unchanged.
            h.close()
            raise

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.

        r.recv = r.read
        fp = socket._fileobject(r, close=True)

        resp = addinfourl(fp, r.msg, req.get_full_url())
        resp.code = r.status
        resp.msg = r.reason
        return resp
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001222
Moshe Zadka8a18e992001-03-01 08:40:42 +00001223
class HTTPHandler(AbstractHTTPHandler):
    """Handler that opens plain ``http`` URLs."""

    # Request pre-processing is inherited unchanged from the abstract base.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Open *req* through ``do_open`` using ``httplib.HTTPConnection``."""
        connection_class = httplib.HTTPConnection
        return self.do_open(connection_class, req)
Moshe Zadka8a18e992001-03-01 08:40:42 +00001230
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Handler that opens ``https`` URLs.

        The class is only defined when ``httplib`` has an ``HTTPS``
        attribute.
        """

        # Request pre-processing is inherited unchanged from the base.
        https_request = AbstractHTTPHandler.do_request_

        def __init__(self, debuglevel=0, context=None):
            """Remember *context*; it is handed to ``do_open`` on each open."""
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context

        def https_open(self, req):
            """Open *req* through ``do_open`` using ``HTTPSConnection``."""
            connection_class = httplib.HTTPSConnection
            return self.do_open(connection_class, req, context=self._context)
1243
class HTTPCookieProcessor(BaseHandler):
    """Handler that passes requests and responses through a cookie jar."""

    def __init__(self, cookiejar=None):
        """Use *cookiejar*, or a new ``cookielib.CookieJar`` when None."""
        import cookielib
        if cookiejar is not None:
            self.cookiejar = cookiejar
        else:
            self.cookiejar = cookielib.CookieJar()

    def http_request(self, request):
        """Let the jar add its cookie header to *request*; return it."""
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Let the jar extract cookies from *response*; return it."""
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    # https traffic is processed exactly like http traffic.
    https_request = http_request
    https_response = http_response
Moshe Zadka8a18e992001-03-01 08:40:42 +00001261
class UnknownHandler(BaseHandler):
    """Handler whose ``unknown_open`` rejects every request."""

    def unknown_open(self, req):
        """Always raise URLError naming the URL type of *req*."""
        scheme = req.get_type()
        raise URLError('unknown url type: %s' % scheme)
1266
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Each element is split at its first '='.  A value that starts and
    ends with a double quote has that outer pair removed.  An empty
    value (as in ``'key='``) is stored as ``''``.

    Returns a dict mapping each key to its value; should a key occur
    more than once anyway, the last occurrence wins.  Raises ValueError
    if an element contains no '='.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slice rather than index: v[0] raises IndexError when the value
        # is empty, whereas v[:1] is simply ''.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1276
def parse_http_list(s):
    """Split a comma-separated list per RFC 2068 Section 2.

    Elements may contain double-quoted strings, and a comma inside such
    a quoted string does not separate elements.  A double quote may also
    appear in the middle of an otherwise unquoted element.  Inside a
    quoted string a backslash is dropped and the character after it is
    kept literally.  Single quotes have no special meaning.

    Returns the elements as a list with surrounding whitespace stripped;
    the enclosing double quotes of a quoted string are kept.
    """
    items = []
    buf = []            # characters of the element being collected
    in_quotes = False
    escaped = False

    for ch in s:
        if escaped:
            # Character following a backslash: take it verbatim.
            buf.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                buf.append(ch)
        elif ch == ',':
            items.append(''.join(buf))
            buf = []
        else:
            if ch == '"':
                in_quotes = True
            buf.append(ch)

    # Flush the final element; an empty trailing element is not reported.
    if buf:
        items.append(''.join(buf))

    return [item.strip() for item in items]
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001319
Senthil Kumaran7cc0fe42010-08-11 18:18:22 +00001320def _safe_gethostbyname(host):
1321 try:
1322 return socket.gethostbyname(host)
1323 except socket.gaierror:
1324 return None
1325
class FileHandler(BaseHandler):
    """Handler for ``file`` URLs."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open *req* locally, or re-dispatch it as FTP for a remote host.

        A selector of the form ``//host/...`` whose request host is set
        and is not ``'localhost'`` has its type switched to ``'ftp'`` and
        is handed back to the parent opener.
        """
        selector = req.get_selector()
        names_remote_host = (selector[:2] == '//' and selector[2:3] != '/'
                             and req.host and req.host != 'localhost')
        if not names_remote_host:
            return self.open_local_file(req)
        req.type = 'ftp'
        return self.parent.open(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the tuple of addresses treated as this machine.

        The tuple is computed on first use and stored on the class.
        """
        if FileHandler.names is not None:
            return FileHandler.names
        try:
            addresses = tuple(
                socket.gethostbyname_ex('localhost')[2] +
                socket.gethostbyname_ex(socket.gethostname())[2])
        except socket.gaierror:
            addresses = (socket.gethostbyname('localhost'),)
        FileHandler.names = addresses
        return addresses

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the local file named by *req*.

        Raises URLError if the file cannot be stat'ed, or if the request
        names a host (or any port) that does not resolve to one of
        ``get_names()``.
        """
        import email.utils
        import mimetypes
        host = req.get_host()
        filename = req.get_selector()
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0] or 'text/plain'
            header_text = (
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype, stats.st_size, modified))
            headers = mimetools.Message(StringIO(header_text))
            if host:
                host, port = splitport(host)
            # ``port`` is only bound when a host was given; the ``not host``
            # test short-circuits before it would be read otherwise.
            is_local = not host or (
                not port and _safe_gethostbyname(host) in self.get_names())
            if is_local:
                origurl = 'file://' + (host or '') + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as msg:
            # urllib2 users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001377
1378class FTPHandler(BaseHandler):
1379 def ftp_open(self, req):
Georg Brandl9d6da3e2006-05-17 15:17:00 +00001380 import ftplib
1381 import mimetypes
Fred Drake13a2c272000-02-10 17:17:14 +00001382 host = req.get_host()
1383 if not host:
Neal Norwitz70700942008-01-24 07:40:51 +00001384 raise URLError('ftp error: no host given')
Martin v. Löwisa79449e2004-02-15 21:19:18 +00001385 host, port = splitport(host)
1386 if port is None:
1387 port = ftplib.FTP_PORT
Kurt B. Kaiser3f7cb5d2004-07-11 17:14:13 +00001388 else:
1389 port = int(port)
Martin v. Löwisa79449e2004-02-15 21:19:18 +00001390
1391 # username/password handling
1392 user, host = splituser(host)
1393 if user:
1394 user, passwd = splitpasswd(user)
1395 else:
1396 passwd = None
1397 host = unquote(host)
Senthil Kumaran9fce5512010-11-20 11:24:08 +00001398 user = user or ''
1399 passwd = passwd or ''
Martin v. Löwisa79449e2004-02-15 21:19:18 +00001400
Jeremy Hylton73574ee2000-10-12 18:54:18 +00001401 try:
1402 host = socket.gethostbyname(host)
1403 except socket.error, msg:
1404 raise URLError(msg)
Fred Drake13a2c272000-02-10 17:17:14 +00001405 path, attrs = splitattr(req.get_selector())
Eric S. Raymondb08b2d32001-02-09 11:10:16 +00001406 dirs = path.split('/')
Martin v. Löwis7db04e72004-02-15 20:51:39 +00001407 dirs = map(unquote, dirs)
Fred Drake13a2c272000-02-10 17:17:14 +00001408 dirs, file = dirs[:-1], dirs[-1]
1409 if dirs and not dirs[0]:
1410 dirs = dirs[1:]
Fred Drake13a2c272000-02-10 17:17:14 +00001411 try:
Facundo Batista10951d52007-06-06 17:15:23 +00001412 fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
Fred Drake13a2c272000-02-10 17:17:14 +00001413 type = file and 'I' or 'D'
1414 for attr in attrs:
Kurt B. Kaiser3f7cb5d2004-07-11 17:14:13 +00001415 attr, value = splitvalue(attr)
Eric S. Raymondb08b2d32001-02-09 11:10:16 +00001416 if attr.lower() == 'type' and \
Fred Drake13a2c272000-02-10 17:17:14 +00001417 value in ('a', 'A', 'i', 'I', 'd', 'D'):
Eric S. Raymondb08b2d32001-02-09 11:10:16 +00001418 type = value.upper()
Fred Drake13a2c272000-02-10 17:17:14 +00001419 fp, retrlen = fw.retrfile(file, type)
Guido van Rossum833a8d82001-08-24 13:10:13 +00001420 headers = ""
1421 mtype = mimetypes.guess_type(req.get_full_url())[0]
1422 if mtype:
Georg Brandl8c036cc2006-08-20 13:15:39 +00001423 headers += "Content-type: %s\n" % mtype
Fred Drake13a2c272000-02-10 17:17:14 +00001424 if retrlen is not None and retrlen >= 0:
Georg Brandl8c036cc2006-08-20 13:15:39 +00001425 headers += "Content-length: %d\n" % retrlen
Guido van Rossum833a8d82001-08-24 13:10:13 +00001426 sf = StringIO(headers)
1427 headers = mimetools.Message(sf)
Fred Drake13a2c272000-02-10 17:17:14 +00001428 return addinfourl(fp, headers, req.get_full_url())
1429 except ftplib.all_errors, msg:
Neal Norwitz70700942008-01-24 07:40:51 +00001430 raise URLError, ('ftp error: %s' % msg), sys.exc_info()[2]
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001431
Facundo Batista10951d52007-06-06 17:15:23 +00001432 def connect_ftp(self, user, passwd, host, port, dirs, timeout):
Nadeem Vawdab42c53e2011-07-23 15:51:16 +02001433 fw = ftpwrapper(user, passwd, host, port, dirs, timeout,
1434 persistent=False)
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001435## fw.ftp.set_debuglevel(1)
1436 return fw
1437
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps its ftpwrapper objects in a cache for reuse.

    Wrappers are cached under (user, host, port, joined dirs, timeout).
    Each use pushes the entry's expiry ``delay`` seconds into the future;
    expired entries are closed and dropped by ``check_cache``.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}       # key -> ftpwrapper
        self.timeout = {}     # key -> expiry time, on the time.time() scale
        self.soonest = 0      # earliest expiry in self.timeout; 0 if empty
        self.delay = 60       # seconds added to time.time() on each use
        self.max_conns = 16

    def setTimeout(self, t):
        """Set the number of seconds an unused entry stays cached."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which ``check_cache`` evicts an entry."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return the cached ftpwrapper for this target, creating it if new."""
        key = user, host, port, '/'.join(dirs), timeout
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs,
                                         timeout)
        self.timeout[key] = time.time() + self.delay
        # Take the reference before pruning: check_cache() may evict this
        # very entry (e.g. when max_conns is 1), and looking it up again
        # afterwards would raise KeyError.
        conn = self.cache[key]
        self.check_cache()
        return conn

    def check_cache(self):
        """Drop expired entries, then evict one entry if the cache is full."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            # Iterate over a snapshot because entries are deleted inside
            # the loop.
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        self.soonest = self._soonest_expiry()

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # NOTE: the evicted wrapper is only dropped from the
                    # cache, not closed, so a caller still holding it can
                    # keep using it.
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = self._soonest_expiry()

    def _soonest_expiry(self):
        """Return the earliest expiry time, or 0 when nothing is cached."""
        # min() raises ValueError on an empty sequence, so guard for the
        # case where every entry has been removed.
        if not self.timeout:
            return 0
        return min(self.timeout.values())

    def clear_cache(self):
        """Close every cached connection and empty the cache."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()