blob: c55dee8ab3ac662ff28da1791362fadaec5d30a6 [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""Open an arbitrary URL.
2
3See the following document for more info on URLs:
4"Names and Addresses, URIs, URLs, URNs, URCs", at
5http://www.w3.org/pub/WWW/Addressing/Overview.html
6
7See also the HTTP spec (from which the error codes are derived):
8"HTTP - Hypertext Transfer Protocol", at
9http://www.w3.org/pub/WWW/Protocols/
10
11Related standards and specs:
12- RFC1808: the "relative URL" spec. (authoritative status)
13- RFC1738 - the "URL standard". (authoritative status)
14- RFC1630 - the "URI spec". (informational status)
15
The object returned by URLopener().open(file) will differ per
protocol.  All you know is that it has methods read(), readline(),
readlines(), fileno(), close() and info().  The read*(), fileno()
and close() methods work like those of open files.
Guido van Rossume7b146f2000-02-04 15:28:42 +000020The info() method returns a mimetools.Message object which can be
21used to query various info about the object, if available.
22(mimetools.Message objects are queried with the getheader() method.)
23"""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000024
Jeremy Hylton5f22af12007-08-16 17:55:18 +000025import httplib
Jack Jansendc3e3f61995-12-15 13:22:13 +000026import os
Jeremy Hylton5f22af12007-08-16 17:55:18 +000027import socket
Guido van Rossum3c8484e1996-11-20 22:02:24 +000028import sys
Jeremy Hylton5f22af12007-08-16 17:55:18 +000029import time
Brett Cannon69200fa2004-03-23 21:26:39 +000030from urlparse import urljoin as basejoin
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000031
Skip Montanaro40fc1602001-03-01 04:27:19 +000032__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
33 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
Skip Montanaro44d5e0c2001-03-13 19:47:16 +000034 "urlencode", "url2pathname", "pathname2url", "splittag",
35 "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
36 "splittype", "splithost", "splituser", "splitpasswd", "splitport",
37 "splitnport", "splitquery", "splitattr", "splitvalue",
Guido van Rossumd59da4b2007-05-22 18:11:13 +000038 "getproxies"]
Skip Montanaro40fc1602001-03-01 04:27:19 +000039
Martin v. Löwis3e865952006-01-24 15:51:21 +000040__version__ = '1.17' # XXX This version is not always updated :-(
Guido van Rossumf668d171997-06-06 21:11:11 +000041
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000042MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
Guido van Rossum6cb15a01995-06-22 19:00:13 +000043
Jack Jansendc3e3f61995-12-15 13:22:13 +000044# Helper for non-unix systems
# Pick the url2pathname()/pathname2url() pair that matches the host
# platform.  Platforms with their own path syntax import a dedicated
# implementation; every other platform uses plain (un)quoting.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use.

        On this platform the conversion is just unquote(pathname)."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use.

        On this platform the conversion is just quote(pathname)."""
        return quote(pathname)
Guido van Rossum33add0a1998-12-18 15:25:22 +000061
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000062# This really consists of two pieces:
63# (1) a class which handles opening of all sorts of URLs
64# (plus assorted utilities etc.)
65# (2) a set of functions for parsing URLs
66# XXX Should these be separated out into different modules?
67
68
69# Shortcut for basic usage
_urlopener = None


def urlopen(url, data=None, proxies=None):
    """urlopen(url [, data [, proxies]]) -> open file-like object

    When proxies is given, a fresh FancyURLopener configured with that
    mapping is used for this call only.  Otherwise a module-wide
    FancyURLopener is created on first use and shared by later calls.
    data, when not None, is passed on to the opener's open() method.
    """
    global _urlopener
    if proxies is not None:
        # An explicit proxy mapping never touches the shared opener.
        opener = FancyURLopener(proxies=proxies)
    else:
        if not _urlopener:
            _urlopener = FancyURLopener()
        opener = _urlopener
    if data is not None:
        return opener.open(url, data)
    return opener.open(url)
Jeremy Hylton39b198d2007-08-04 19:22:00 +000085
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve url through the shared opener.

    Creates the module-wide FancyURLopener on first use and returns
    whatever its retrieve() method returns.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
Jeremy Hylton39b198d2007-08-04 19:22:00 +000091
def urlcleanup():
    """Call cleanup() on the shared opener, if one has been created."""
    opener = _urlopener
    if not opener:
        return
    opener.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000095
Georg Brandlb9256022005-08-24 18:46:39 +000096# exception raised when downloaded size does not match content-length
class ContentTooShortError(IOError):
    """Raised when fewer bytes were downloaded than Content-Length promised.

    The value passed as `content` is kept on the `content` attribute.
    """

    def __init__(self, message, content):
        super(ContentTooShortError, self).__init__(message)
        self.content = content
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000101
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000102ftpcache = {}
103class URLopener:
Guido van Rossume7b146f2000-02-04 15:28:42 +0000104 """Class to open URLs.
105 This is a class rather than just a subroutine because we may need
106 more than one set of global protocol-specific options.
107 Note -- this is a base class for those who don't want the
108 automatic handling of errors type 302 (relocated) and 401
109 (authorization needed)."""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000110
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000111 __tempfiles = None
Guido van Rossum29e77811996-11-27 19:39:58 +0000112
Guido van Rossumba311382000-08-24 16:18:04 +0000113 version = "Python-urllib/%s" % __version__
114
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000115 # Constructor
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000116 def __init__(self, proxies=None, **x509):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000117 if proxies is None:
118 proxies = getproxies()
Guido van Rossume2b70bc2006-08-18 22:13:04 +0000119 assert hasattr(proxies, 'keys'), "proxies must be a mapping"
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000120 self.proxies = proxies
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000121 self.key_file = x509.get('key_file')
122 self.cert_file = x509.get('cert_file')
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000123 self.addheaders = [('User-Agent', self.version)]
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000124 self.__tempfiles = []
125 self.__unlink = os.unlink # See cleanup()
126 self.tempcache = None
127 # Undocumented feature: if you assign {} to tempcache,
128 # it is used to cache files retrieved with
129 # self.retrieve(). This is not enabled by default
130 # since it does not work for changing documents (and I
131 # haven't got the logic to check expiration headers
132 # yet).
133 self.ftpcache = ftpcache
134 # Undocumented feature: you can use a different
135 # ftp cache by assigning to the .ftpcache member;
136 # in case you want logically independent URL openers
137 # XXX This is not threadsafe. Bah.
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000138
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000139 def __del__(self):
140 self.close()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000141
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000142 def close(self):
143 self.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000144
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000145 def cleanup(self):
146 # This code sometimes runs when the rest of this module
147 # has already been deleted, so it can't use any globals
148 # or import anything.
149 if self.__tempfiles:
150 for file in self.__tempfiles:
151 try:
152 self.__unlink(file)
Martin v. Löwis58682b72001-08-11 15:02:57 +0000153 except OSError:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000154 pass
155 del self.__tempfiles[:]
156 if self.tempcache:
157 self.tempcache.clear()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000158
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000159 def addheader(self, *args):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000160 """Add a header to be used by the HTTP interface only
161 e.g. u.addheader('Accept', 'sound/basic')"""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000162 self.addheaders.append(args)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000163
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000164 # External interface
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000165 def open(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000166 """Use URLopener().open(file) instead of open(file, 'r')."""
Martin v. Löwis1d994332000-12-03 18:30:10 +0000167 fullurl = unwrap(toBytes(fullurl))
Raymond Hettinger54f02222002-06-01 14:18:47 +0000168 if self.tempcache and fullurl in self.tempcache:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000169 filename, headers = self.tempcache[fullurl]
170 fp = open(filename, 'rb')
171 return addinfourl(fp, headers, fullurl)
Martin v. Löwis1d994332000-12-03 18:30:10 +0000172 urltype, url = splittype(fullurl)
173 if not urltype:
174 urltype = 'file'
Raymond Hettinger54f02222002-06-01 14:18:47 +0000175 if urltype in self.proxies:
Martin v. Löwis1d994332000-12-03 18:30:10 +0000176 proxy = self.proxies[urltype]
177 urltype, proxyhost = splittype(proxy)
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000178 host, selector = splithost(proxyhost)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000179 url = (host, fullurl) # Signal special case to open_*()
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000180 else:
181 proxy = None
Martin v. Löwis1d994332000-12-03 18:30:10 +0000182 name = 'open_' + urltype
183 self.type = urltype
Brett Cannonaaeffaf2004-03-23 23:50:17 +0000184 name = name.replace('-', '_')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000185 if not hasattr(self, name):
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000186 if proxy:
187 return self.open_unknown_proxy(proxy, fullurl, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000188 else:
189 return self.open_unknown(fullurl, data)
190 try:
191 if data is None:
192 return getattr(self, name)(url)
193 else:
194 return getattr(self, name)(url, data)
Guido van Rossumb940e112007-01-10 16:19:56 +0000195 except socket.error as msg:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000196 raise IOError, ('socket error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000197
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000198 def open_unknown(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000199 """Overridable interface to open unknown URL type."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000200 type, url = splittype(fullurl)
201 raise IOError, ('url error', 'unknown url type', type)
Guido van Rossumca445401995-08-29 19:19:12 +0000202
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000203 def open_unknown_proxy(self, proxy, fullurl, data=None):
204 """Overridable interface to open unknown URL type."""
205 type, url = splittype(fullurl)
206 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
207
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000208 # External interface
Sjoerd Mullenderd7b86f02000-08-25 11:23:36 +0000209 def retrieve(self, url, filename=None, reporthook=None, data=None):
Brett Cannon7d618c72003-04-24 02:43:20 +0000210 """retrieve(url) returns (filename, headers) for a local object
Guido van Rossume7b146f2000-02-04 15:28:42 +0000211 or (tempfilename, headers) for a remote object."""
Martin v. Löwis1d994332000-12-03 18:30:10 +0000212 url = unwrap(toBytes(url))
Raymond Hettinger54f02222002-06-01 14:18:47 +0000213 if self.tempcache and url in self.tempcache:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000214 return self.tempcache[url]
215 type, url1 = splittype(url)
Raymond Hettinger10ff7062002-06-02 03:04:52 +0000216 if filename is None and (not type or type == 'file'):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000217 try:
218 fp = self.open_local_file(url1)
219 hdrs = fp.info()
220 del fp
221 return url2pathname(splithost(url1)[1]), hdrs
Guido van Rossumb940e112007-01-10 16:19:56 +0000222 except IOError as msg:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000223 pass
Fred Drake316a7932000-08-24 01:01:26 +0000224 fp = self.open(url, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000225 headers = fp.info()
Guido van Rossum3b0a3292002-08-09 16:38:32 +0000226 if filename:
227 tfp = open(filename, 'wb')
228 else:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000229 import tempfile
230 garbage, path = splittype(url)
231 garbage, path = splithost(path or "")
232 path, garbage = splitquery(path or "")
233 path, garbage = splitattr(path or "")
234 suffix = os.path.splitext(path)[1]
Guido van Rossum3b0a3292002-08-09 16:38:32 +0000235 (fd, filename) = tempfile.mkstemp(suffix)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000236 self.__tempfiles.append(filename)
Jeremy Hylton3bd6fde2002-10-11 14:36:24 +0000237 tfp = os.fdopen(fd, 'wb')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000238 result = filename, headers
239 if self.tempcache is not None:
240 self.tempcache[url] = result
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000241 bs = 1024*8
242 size = -1
Georg Brandlb9256022005-08-24 18:46:39 +0000243 read = 0
Georg Brandl5a650a22005-08-26 08:51:34 +0000244 blocknum = 0
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000245 if reporthook:
Raymond Hettinger54f02222002-06-01 14:18:47 +0000246 if "content-length" in headers:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000247 size = int(headers["Content-Length"])
Georg Brandl5a650a22005-08-26 08:51:34 +0000248 reporthook(blocknum, bs, size)
249 while 1:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000250 block = fp.read(bs)
Guido van Rossuma0982942007-07-10 08:30:03 +0000251 if not block:
Georg Brandl5a650a22005-08-26 08:51:34 +0000252 break
Georg Brandlb9256022005-08-24 18:46:39 +0000253 read += len(block)
Georg Brandl5a650a22005-08-26 08:51:34 +0000254 tfp.write(block)
Georg Brandlb9256022005-08-24 18:46:39 +0000255 blocknum += 1
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000256 if reporthook:
257 reporthook(blocknum, bs, size)
258 fp.close()
259 tfp.close()
260 del fp
261 del tfp
Georg Brandlb9256022005-08-24 18:46:39 +0000262
263 # raise exception if actual size does not match content-length header
264 if size >= 0 and read < size:
265 raise ContentTooShortError("retrieval incomplete: got only %i out "
266 "of %i bytes" % (read, size), result)
267
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000268 return result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000269
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000270 # Each method named open_<type> knows how to open that type of URL
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000271
    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_factory.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieve or a host, relative-path pair.
        - data is payload for a POST request or None.

        Returns an addinfourl object for a 200 response; any other
        status is handed to self.http_error().  Raises IOError when no
        host can be determined or the status line is bad.
        """

        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            # Direct request: url is "//host/selector".
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            # Proxied request: url is the (proxyhost, fullurl) pair built
            # by open().
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            # NOTE(review): the scheme is compared against 'http' only,
            # even when this is reached through open_https() -- confirm
            # that is intended.
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    # Rebuild the selector without the user:password part.
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

            #print "proxy via http:", host, selector
        if not host: raise IOError, ('http error', 'no host given')

        if proxy_passwd:
            import base64
            proxy_auth = base64.b64encode(proxy_passwd).strip()
        else:
            proxy_auth = None

        if user_passwd:
            import base64
            auth = base64.b64encode(user_passwd).strip()
        else:
            auth = None
        http_conn = connection_factory(host)
        # XXX We should fix urllib so that it works with HTTP/1.1.
        http_conn._http_vsn = 10
        http_conn._http_vsn_str = "HTTP/1.0"

        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] = "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost
        # Entries from addheaders go in last, so they override the
        # headers computed above when the names collide.
        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except httplib.BadStatusLine:
            # something went wrong with the HTTP status line
            raise IOError('http protocol error', 0,
                          'got a bad status line', None)

        if response.status == 200:
            # NOTE(review): the reported URL is always prefixed with
            # "http:", including for HTTPS requests -- confirm.
            return addinfourl(response.fp, response.msg, "http:" + url)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000360
Jeremy Hylton5f22af12007-08-16 17:55:18 +0000361 def open_http(self, url, data=None):
362 """Use HTTP protocol."""
363 return self._open_generic_http(httplib.HTTPConnection, url, data)
364
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000365 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000366 """Handle http errors.
Jeremy Hylton39b198d2007-08-04 19:22:00 +0000367
Guido van Rossume7b146f2000-02-04 15:28:42 +0000368 Derived class can override this, or provide specific handlers
369 named http_error_DDD where DDD is the 3-digit error code."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000370 # First check if there's a specific handler for this error
371 name = 'http_error_%d' % errcode
372 if hasattr(self, name):
373 method = getattr(self, name)
374 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000375 result = method(url, fp, errcode, errmsg, headers)
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000376 else:
377 result = method(url, fp, errcode, errmsg, headers, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000378 if result: return result
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000379 return self.http_error_default(url, fp, errcode, errmsg, headers)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000380
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000381 def http_error_default(self, url, fp, errcode, errmsg, headers):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000382 """Default error handler: close the connection and raise IOError."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000383 void = fp.read()
384 fp.close()
385 raise IOError, ('http error', errcode, errmsg, headers)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000386
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000387 if hasattr(socket, "ssl"):
Jeremy Hylton5f22af12007-08-16 17:55:18 +0000388 def _https_connection(self, host):
389 return httplib.HTTPSConnection(host,
390 key_file=self.key_file,
391 cert_file=self.cert_file)
392
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000393 def open_https(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000394 """Use HTTPS protocol."""
Jeremy Hylton5f22af12007-08-16 17:55:18 +0000395 return self._open_generic_http(self._https_connection, url, data)
Fred Drake567ca8e2000-08-21 21:42:42 +0000396
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000397 def open_file(self, url):
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000398 """Use local file or FTP depending on form of URL."""
Martin v. Löwis3e865952006-01-24 15:51:21 +0000399 if not isinstance(url, str):
400 raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
Jack Jansen4ef11032002-09-12 20:14:04 +0000401 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000402 return self.open_ftp(url)
403 else:
404 return self.open_local_file(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000405
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000406 def open_local_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000407 """Use local file."""
Thomas Woutersb2137042007-02-01 18:02:27 +0000408 import mimetypes, mimetools, email.utils
Guido van Rossum68937b42007-05-18 00:51:22 +0000409 from io import StringIO
Guido van Rossumf0713d32001-08-09 17:43:35 +0000410 host, file = splithost(url)
411 localname = url2pathname(file)
Guido van Rossuma2da3052002-04-15 00:25:01 +0000412 try:
413 stats = os.stat(localname)
Guido van Rossumb940e112007-01-10 16:19:56 +0000414 except OSError as e:
Guido van Rossuma2da3052002-04-15 00:25:01 +0000415 raise IOError(e.errno, e.strerror, e.filename)
Walter Dörwald92b48b72002-03-22 17:30:38 +0000416 size = stats.st_size
Thomas Woutersb2137042007-02-01 18:02:27 +0000417 modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000418 mtype = mimetypes.guess_type(url)[0]
Raymond Hettingera6172712004-12-31 19:15:26 +0000419 headers = mimetools.Message(StringIO(
Guido van Rossumf0713d32001-08-09 17:43:35 +0000420 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
421 (mtype or 'text/plain', size, modified)))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000422 if not host:
Guido van Rossum336a2011999-06-24 15:27:36 +0000423 urlfile = file
424 if file[:1] == '/':
425 urlfile = 'file://' + file
Guido van Rossumf0713d32001-08-09 17:43:35 +0000426 return addinfourl(open(localname, 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000427 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000428 host, port = splitport(host)
429 if not port \
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000430 and socket.gethostbyname(host) in (localhost(), thishost()):
Guido van Rossum336a2011999-06-24 15:27:36 +0000431 urlfile = file
432 if file[:1] == '/':
433 urlfile = 'file://' + file
Guido van Rossumf0713d32001-08-09 17:43:35 +0000434 return addinfourl(open(localname, 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000435 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000436 raise IOError, ('local file error', 'not on local host')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000437
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000438 def open_ftp(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000439 """Use FTP protocol."""
Martin v. Löwis3e865952006-01-24 15:51:21 +0000440 if not isinstance(url, str):
441 raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
Raymond Hettingera6172712004-12-31 19:15:26 +0000442 import mimetypes, mimetools
Guido van Rossum68937b42007-05-18 00:51:22 +0000443 from io import StringIO
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000444 host, path = splithost(url)
445 if not host: raise IOError, ('ftp error', 'no host given')
446 host, port = splitport(host)
447 user, host = splituser(host)
448 if user: user, passwd = splitpasswd(user)
449 else: passwd = None
450 host = unquote(host)
451 user = unquote(user or '')
452 passwd = unquote(passwd or '')
453 host = socket.gethostbyname(host)
454 if not port:
455 import ftplib
456 port = ftplib.FTP_PORT
457 else:
458 port = int(port)
459 path, attrs = splitattr(path)
460 path = unquote(path)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000461 dirs = path.split('/')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000462 dirs, file = dirs[:-1], dirs[-1]
463 if dirs and not dirs[0]: dirs = dirs[1:]
Guido van Rossum5e006a31999-08-18 17:40:33 +0000464 if dirs and not dirs[0]: dirs[0] = '/'
Guido van Rossumb2493f82000-12-15 15:01:37 +0000465 key = user, host, port, '/'.join(dirs)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000466 # XXX thread unsafe!
467 if len(self.ftpcache) > MAXFTPCACHE:
468 # Prune the cache, rather arbitrarily
469 for k in self.ftpcache.keys():
470 if k != key:
471 v = self.ftpcache[k]
472 del self.ftpcache[k]
473 v.close()
474 try:
Raymond Hettinger54f02222002-06-01 14:18:47 +0000475 if not key in self.ftpcache:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000476 self.ftpcache[key] = \
477 ftpwrapper(user, passwd, host, port, dirs)
478 if not file: type = 'D'
479 else: type = 'I'
480 for attr in attrs:
481 attr, value = splitvalue(attr)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000482 if attr.lower() == 'type' and \
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000483 value in ('a', 'A', 'i', 'I', 'd', 'D'):
Guido van Rossumb2493f82000-12-15 15:01:37 +0000484 type = value.upper()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000485 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
Guido van Rossum88e0b5b2001-08-23 13:38:15 +0000486 mtype = mimetypes.guess_type("ftp:" + url)[0]
487 headers = ""
488 if mtype:
489 headers += "Content-Type: %s\n" % mtype
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000490 if retrlen is not None and retrlen >= 0:
Guido van Rossum88e0b5b2001-08-23 13:38:15 +0000491 headers += "Content-Length: %d\n" % retrlen
Raymond Hettingera6172712004-12-31 19:15:26 +0000492 headers = mimetools.Message(StringIO(headers))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000493 return addinfourl(fp, headers, "ftp:" + url)
Guido van Rossumb940e112007-01-10 16:19:56 +0000494 except ftperrors() as msg:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000495 raise IOError, ('ftp error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000496
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000497 def open_data(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000498 """Use "data" URL."""
Martin v. Löwis3e865952006-01-24 15:51:21 +0000499 if not isinstance(url, str):
500 raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000501 # ignore POSTed data
502 #
503 # syntax of data URLs:
504 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
505 # mediatype := [ type "/" subtype ] *( ";" parameter )
506 # data := *urlchar
507 # parameter := attribute "=" value
Raymond Hettingera6172712004-12-31 19:15:26 +0000508 import mimetools
Guido van Rossum68937b42007-05-18 00:51:22 +0000509 from io import StringIO
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000510 try:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000511 [type, data] = url.split(',', 1)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000512 except ValueError:
513 raise IOError, ('data error', 'bad data URL')
514 if not type:
515 type = 'text/plain;charset=US-ASCII'
Guido van Rossumb2493f82000-12-15 15:01:37 +0000516 semi = type.rfind(';')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000517 if semi >= 0 and '=' not in type[semi:]:
518 encoding = type[semi+1:]
519 type = type[:semi]
520 else:
521 encoding = ''
522 msg = []
523 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
524 time.gmtime(time.time())))
525 msg.append('Content-type: %s' % type)
526 if encoding == 'base64':
527 import base64
528 data = base64.decodestring(data)
529 else:
530 data = unquote(data)
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000531 msg.append('Content-Length: %d' % len(data))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000532 msg.append('')
533 msg.append(data)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000534 msg = '\n'.join(msg)
Raymond Hettingera6172712004-12-31 19:15:26 +0000535 f = StringIO(msg)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000536 headers = mimetools.Message(f, 0)
Georg Brandl1f663572005-11-26 16:50:44 +0000537 #f.fileno = None # needed for addinfourl
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000538 return addinfourl(f, headers, url)
Guido van Rossum6d4d1c21998-03-12 14:32:55 +0000539
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000540
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps).

    On top of URLopener this follows redirects (301/302/303/307) up to a
    recursion limit and retries 401/407 responses using Basic
    authentication.
    """

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}    # "realm@host" -> (user, passwd)
        self.tries = 0          # redirects followed for the current request
        self.maxtries = 10      # redirect limit; a false value disables it

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        Follows the redirect unless maxtries redirects have already been
        followed, in which case the response is reported as a 500 through
        http_error_500 (if defined) or http_error_default.
        """
        self.tries += 1
        try:
            if self.maxtries and self.tries >= self.maxtries:
                if hasattr(self, "http_error_500"):
                    meth = self.http_error_500
                else:
                    meth = self.http_error_default
                self.tries = 0
                return meth(url, fp, 500,
                            "Internal Server Error: Redirect Recursion",
                            headers)
            return self.redirect_internal(url, fp, errcode, errmsg, headers,
                                          data)
        finally:
            # Reset even when following the redirect raises, so that one
            # failed request does not eat into the budget of the next one.
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Open the redirect target; return None if no target was sent."""
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        # Drain and close the redirect response before opening the target.
        fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # NOTE(review): URLopener.http_error_default is defined outside this
        # view; the code below relies on it not returning normally.
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        # Dispatch on the URL type: retry_http_basic_auth and friends.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        else:
            return getattr(self, name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # NOTE(review): same reliance on URLopener.http_error_default as in
        # http_error_401.
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        # Dispatch on the URL type: retry_proxy_http_basic_auth and friends.
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        else:
            return getattr(self, name)(url, realm, data)

    def _retry_proxy_basic_auth(self, scheme, url, realm, data):
        """Put credentials into self.proxies[scheme] and reopen url."""
        host, selector = splithost(url)
        newurl = scheme + '://' + host + selector
        urltype, proxyhost = splittype(self.proxies[scheme])
        proxyhost, proxyselector = splithost(proxyhost)
        # i is non-zero when the proxy URL already carried "user:passwd@";
        # it is passed as clear_cache so a stale cached entry is dropped.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd):
            return None
        proxyhost = (quote(user, safe='') + ':' + quote(passwd, safe='')
                     + '@' + proxyhost)
        self.proxies[scheme] = scheme + '://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def _retry_basic_auth(self, scheme, url, realm, data):
        """Reopen url with "user:passwd@" credentials in its host part."""
        host, selector = splithost(url)
        # i is non-zero when the URL already carried "user:passwd@";
        # it is passed as clear_cache so a stale cached entry is dropped.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd):
            return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = scheme + '://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Retry an http request after a 407 from the http proxy."""
        return self._retry_proxy_basic_auth('http', url, realm, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Retry an https request after a 407 from the https proxy."""
        return self._retry_proxy_basic_auth('https', url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Retry an http request after a 401, adding credentials."""
        return self._retry_basic_auth('http', url, realm, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Retry an https request after a 401, adding credentials."""
        return self._retry_basic_auth('https', url, realm, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return (user, passwd) for realm at host, prompting if needed.

        A cached pair is returned unless clear_cache is true, in which
        case the cached entry is deleted and the user is prompted again.
        Only pairs with a non-empty user or password are cached.
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd:
            self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                                     (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000724
725
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000726# Utility functions
727
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup is performed once; later calls return the value kept in
    the module-level _localhost.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000735
_thishost = None
def thishost():
    """Return the IP address of the current host.

    The lookup is performed once; later calls return the value kept in
    the module-level _thishost.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000743
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported on first use only; the result is kept in the
    module-level _ftperrors.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000752
_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object.

    The object is built once from an empty StringIO and then shared by
    all callers through the module-level _noheaders.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    import mimetools
    from io import StringIO
    _noheaders = mimetools.Message(StringIO(), 0)
    _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000763
764
765# Utility classes
766
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        """Remember the connection parameters and connect immediately."""
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        """(Re)connect, log in and cwd through each entry of self.dirs."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dirname in self.dirs:
            self.ftp.cwd(dirname)

    def retrfile(self, file, type):
        """Start retrieving file, falling back to a directory listing.

        type is appended to the FTP TYPE command; 'd' or 'D' requests a
        listing.  Returns (fileobj, length), where closing fileobj ends
        the transfer and length is whatever ntransfercmd() reported.
        Raises IOError for a RETR failure other than a 550 reply.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'):
            cmd = 'TYPE A'
            isdir = True
        else:
            cmd = 'TYPE ' + type
            isdir = False
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # Reconnect once and retry the command on the new connection.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # A 550 reply falls through to the listing attempt below.
                if str(reason)[:3] != '550':
                    raise IOError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing
            if file:
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])

    def endtransfer(self):
        """Read the final response of a pending transfer, ignoring errors."""
        if not self.busy:
            return
        import ftplib
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftplib.all_errors:
            pass

    def close(self):
        """Finish any pending transfer and close the FTP connection."""
        import ftplib
        self.endtransfer()
        try:
            self.ftp.close()
        except ftplib.all_errors:
            pass
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000833
class addbase:
    """Base class for addinfo and addclosehook.

    Wraps a file-like object and re-exports its read(), readline(),
    readlines() and fileno() methods as attributes of the wrapper.
    """

    # XXX Add a method to expose the timeout on the underlying socket?

    def __init__(self, fp):
        self.fp = fp
        self.read = self.fp.read
        self.readline = self.fp.readline
        if hasattr(self.fp, "readlines"):
            self.readlines = self.fp.readlines
        if hasattr(self.fp, "fileno"):
            self.fileno = self.fp.fileno
        else:
            self.fileno = lambda: None
        # Kept for callers that invoke obj.__next__() explicitly.
        if hasattr(self.fp, "__next__"):
            self.__next__ = self.fp.__next__

    def __iter__(self):
        # Defined on the class: iter() and for-loops look __iter__ up on
        # the type, so an attribute set on the instance is never used.
        return iter(self.fp)

    def __repr__(self):
        return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
                                             id(self), self.fp)

    def close(self):
        """Close the wrapped file and drop every reference to it."""
        self.read = None
        self.readline = None
        self.readlines = None
        self.fileno = None
        # Compare with None so an fp that evaluates false is still closed.
        if self.fp is not None:
            self.fp.close()
        self.fp = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000864
class addclosehook(addbase):
    """Class to add a close hook to an open file.

    closehook(*hookargs) is called once, after the wrapped file has been
    closed.
    """

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        addbase.close(self)
        hook = self.closehook
        if not hook:
            return
        hook(*self.hookargs)
        # Forget the hook so a second close() does not run it again.
        self.closehook = None
        self.hookargs = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000879
class addinfo(addbase):
    """Class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        # Store the headers first; the base initializer only touches fp.
        self.headers = headers
        addbase.__init__(self, fp)

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000889
class addinfourl(addbase):
    """Class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        # Store the metadata first; the base initializer only touches fp.
        self.url = url
        self.headers = headers
        addbase.__init__(self, fp)

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers

    def geturl(self):
        """Return the URL given to the constructor."""
        return self.url
Guido van Rossume6ad8911996-09-10 17:02:56 +0000903
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000904
Guido van Rossum7c395db1994-07-04 22:14:49 +0000905# Utilities to parse URLs (most of these return None for missing parts):
Sjoerd Mullendere0371b81995-11-10 10:36:07 +0000906# unwrap('<URL:type://host/path>') --> 'type://host/path'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000907# splittype('type:opaquestring') --> 'type', 'opaquestring'
908# splithost('//host[:port]/path') --> 'host[:port]', '/path'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000909# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
910# splitpasswd('user:passwd') -> 'user', 'passwd'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000911# splitport('host:port') --> 'host', 'port'
912# splitquery('/path?query') --> '/path', 'query'
913# splittag('/path#tag') --> '/path', 'tag'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000914# splitattr('/path;attr1=value1;attr2=value2;...') ->
915# '/path', ['attr1=value1', 'attr2=value2', ...]
916# splitvalue('attr=value') --> 'attr', 'value'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000917# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
919
Walter Dörwald65230a22002-06-03 15:58:32 +0000920try:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000921 str
Walter Dörwald65230a22002-06-03 15:58:32 +0000922except NameError:
Guido van Rossum4b46c0a2002-05-24 17:58:05 +0000923 def _is_unicode(x):
924 return 0
Walter Dörwald65230a22002-06-03 15:58:32 +0000925else:
926 def _is_unicode(x):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000927 return isinstance(x, str)
Guido van Rossum4b46c0a2002-05-24 17:58:05 +0000928
def toBytes(url):
    """toBytes(u"URL") --> 'URL'.

    A str is encoded as ASCII; any other object is returned unchanged.
    Raises UnicodeError if the str contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII")
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
940
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Strips surrounding whitespace, an enclosing '<...>' pair and a
    leading 'URL:' marker, in that order.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000948
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case; it is None when url does not
    start with a 'type:' prefix.
    """
    global _typeprog
    if _typeprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if not found:
        return None, url
    # The pattern is anchored at 0, so end() is just past the colon.
    return found.group(1).lower(), url[found.end():]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000962
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    found = _hostprog.match(url)
    if not found:
        return None, url
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000974
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    Both parts are passed through unquote().  Returns (None, host) when
    host contains no '@'.  The result is always a 2-tuple.
    """
    global _userprog
    if _userprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    found = _userprog.match(host)
    if found is None:
        return None, host
    # Materialize the map so both return paths yield a real tuple.
    return tuple(map(unquote, found.group(1, 2)))
Guido van Rossum7c395db1994-07-04 22:14:49 +0000986
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    Splits at the first ':'; returns (user, None) when there is none.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if not found:
        return user, None
    return found.groups()
Guido van Rossum7c395db1994-07-04 22:14:49 +0000998
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits; (host, None) is returned
    when host does not end in ':' followed by at least one digit.
    """
    global _portprog
    if _portprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if not found:
        return host, None
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001011
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if not found:
        return host, defport
    host, port = found.group(1, 2)
    try:
        if not port:
            raise ValueError("no digits")
        nport = int(port)
    except ValueError:
        nport = None
    return host, nport
Guido van Rossum53725a21996-06-13 19:12:35 +00001033
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    Splits at the last '?'; returns (url, None) when there is none.
    """
    global _queryprog
    if _queryprog is None:
        # Compiled on first use and kept for later calls.  Raw string:
        # '\?' is not a valid escape in an ordinary string literal.
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    found = _queryprog.match(url)
    if not found:
        return url, None
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001045
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    Splits at the last '#'; returns (url, None) when there is none.
    """
    global _tagprog
    if _tagprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if not found:
        return url, None
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001057
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
Guido van Rossum7c395db1994-07-04 22:14:49 +00001063
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    Splits at the first '='; returns (attr, None) when there is none.
    """
    global _valueprog
    if _valueprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if not found:
        return attr, None
    return found.groups()
Guido van Rossum7c395db1994-07-04 22:14:49 +00001075
Raymond Hettinger803ce802005-09-10 06:49:04 +00001076_hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
1077_hextochr.update(('%02X' % i, chr(i)) for i in range(256))
1078
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001079def unquote(s):
Guido van Rossume7b146f2000-02-04 15:28:42 +00001080 """unquote('abc%20def') -> 'abc def'."""
Raymond Hettinger803ce802005-09-10 06:49:04 +00001081 res = s.split('%')
Guido van Rossum805365e2007-05-07 22:24:25 +00001082 for i in range(1, len(res)):
Raymond Hettinger803ce802005-09-10 06:49:04 +00001083 item = res[i]
1084 try:
1085 res[i] = _hextochr[item[:2]] + item[2:]
1086 except KeyError:
1087 res[i] = '%' + item
Raymond Hettinger4b0f20d2005-10-15 16:41:53 +00001088 except UnicodeDecodeError:
Guido van Rossum84fc66d2007-05-03 17:18:26 +00001089 res[i] = chr(int(item[:2], 16)) + item[2:]
Guido van Rossumb2493f82000-12-15 15:01:37 +00001090 return "".join(res)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001091
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'"""
    # '+' becomes a space before the escapes are decoded, so an escaped
    # plus ('%2B') still comes out as a literal '+'.
    return unquote(s.replace('+', ' '))
Guido van Rossum0564e121996-12-13 14:47:36 +00001096
Fredrik Lundhb49f88b2000-09-24 18:51:25 +00001097always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
Jeremy Hylton6102e292000-08-31 15:48:10 +00001098 'abcdefghijklmnopqrstuvwxyz'
Jeremy Hylton7ae51bf2000-09-14 16:59:07 +00001099 '0123456789' '_.-')
Guido van Rossumc0f2d2d2007-08-03 19:19:24 +00001100_safe_quoters= {}
1101
1102class Quoter:
1103 def __init__(self, safe):
1104 self.cache = {}
1105 self.safe = safe + always_safe
1106
1107 def __call__(self, c):
1108 try:
1109 return self.cache[c]
1110 except KeyError:
1111 if ord(c) < 256:
1112 res = (c in self.safe) and c or ('%%%02X' % ord(c))
1113 self.cache[c] = res
1114 return res
1115 else:
1116 return "".join(['%%%02X' % i for i in c.encode("utf-8")])
Jeremy Hylton7ae51bf2000-09-14 16:59:07 +00001117
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    # One Quoter per distinct (safe, always_safe) pair, created on demand.
    cachekey = (safe, always_safe)
    quoter = _safe_quoters.get(cachekey)
    if quoter is None:
        quoter = _safe_quoters[cachekey] = Quoter(safe)
    return ''.join(quoter(ch) for ch in s)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001147
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' not in s:
        return quote(s, safe)
    # Treat ' ' as safe while quoting, then turn each space into '+'.
    quoted = quote(s, safe + ' ')
    return quoted.replace(' ', '+')
Guido van Rossum0564e121996-12-13 14:47:36 +00001154
def urlencode(query,doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Raises TypeError ("not a valid non-string sequence or mapping object")
    when query has no items() method and is either not sized/indexable or
    is a non-empty sequence whose first element is not a tuple.
    """

    if hasattr(query, "items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            # Re-raise with a clearer message while keeping the traceback of
            # the original failure (the three-argument raise statement is
            # not valid syntax in Python 3).
            ty, va, tb = sys.exc_info()
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    pairs = []
    if not doseq:
        # preserve old behavior: every value is stringified as a whole
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            pairs.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if isinstance(v, str):
                v = quote_plus(v)
                pairs.append(k + '=' + v)
            elif _is_unicode(v):
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = quote_plus(v.encode("ASCII","replace"))
                pairs.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v))
                    pairs.append(k + '=' + v)
                else:
                    # loop over the sequence: one key=value pair per element
                    for elt in v:
                        pairs.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(pairs)
Guido van Rossum810a3391998-07-22 21:33:23 +00001217
Guido van Rossum442e7201996-03-20 15:33:11 +00001218# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Every environment variable whose lower-cased name ends in "_proxy"
    and whose value is non-empty contributes one entry; the key is the
    lower-cased name with that suffix removed.  If you need a different
    way, you can pass a proxies dictionary to the [Fancy]URLopener
    constructor.

    """
    suffix = '_proxy'
    found = {}
    for variable, url in os.environ.items():
        lowered = variable.lower()
        if not url or not lowered.endswith(suffix):
            continue
        found[lowered[:-len(suffix)]] = url
    return found
1234
if sys.platform == 'darwin':
    def getproxies_internetconfig():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        Returns an empty dictionary when the ic module cannot be imported
        or ic.IC() raises ic.error.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

    def proxy_bypass(x):
        """Always return 0: no bypass list is consulted on this platform."""
        return 0

    def getproxies():
        """Return proxies from the environment, else from Internet Config."""
        return getproxies_environment() or getproxies_internetconfig()

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        Registry errors and values in an unexpected format are swallowed;
        whatever was collected up to that point is returned.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            try:
                proxyEnable = _winreg.QueryValueEx(internetSettings,
                                                   'ProxyEnable')[0]
                if proxyEnable:
                    # Returned as Unicode but problems if not converted to ASCII
                    proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                           'ProxyServer')[0])
                    if '=' in proxyServer:
                        # Per-protocol settings
                        import re
                        for p in proxyServer.split(';'):
                            protocol, address = p.split('=', 1)
                            # See if address has a type:// prefix
                            if not re.match('^([^/:]+)://', address):
                                address = '%s://%s' % (protocol, address)
                            proxies[protocol] = address
                    else:
                        # Use one setting for all protocols
                        if proxyServer[:5] == 'http:':
                            proxies['http'] = proxyServer
                        else:
                            proxies['http'] = 'http://%s' % proxyServer
                            proxies['ftp'] = 'ftp://%s' % proxyServer
            finally:
                # Release the key even when a query or the parsing fails.
                internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass(host):
        """Return 1 if host matches the registry's ProxyOverride list, else 0.

        host may carry a port; it is split off with splitport().  The bare
        name, its resolved address and its fully qualified name are each
        tested against every ';'-separated override entry, where '*' and
        '?' act as glob characters and '<local>' expands to the local
        host's names and addresses.  Returns 0 when the registry cannot be
        read, proxying is disabled, or no override list is set.
        """
        try:
            import _winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            try:
                proxyEnable = _winreg.QueryValueEx(internetSettings,
                                                   'ProxyEnable')[0]
                # Returned as Unicode but problems if not converted to ASCII
                proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                         'ProxyOverride')[0])
            finally:
                # The key is only needed for the two queries above.
                internetSettings.Close()
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        i = 0
        while i < len(proxyOverride):
            if proxyOverride[i] == '<local>':
                proxyOverride[i:i+1] = ['localhost',
                                        '127.0.0.1',
                                        socket.gethostname(),
                                        socket.gethostbyname(
                                            socket.gethostname())]
            i += 1
        # now check if we match one of the registry values.
        for test in proxyOverride:
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # NOTE: re.match anchors only at the start of val, so an
                # entry also matches hosts that merely begin with it.
                if re.match(test, val, re.I):
                    return 1
        return 0

else:
    # By default use environment variables
    getproxies = getproxies_environment

    def proxy_bypass(host):
        """Always return 0: no bypass list is consulted on this platform."""
        return 0
Guido van Rossum442e7201996-03-20 15:33:11 +00001392
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001393# Test and time quote() and unquote()
def test1():
    """Round-trip all 256 character codes through quote()/unquote().

    Prints 'Wrong!' if the round trip does not reproduce the input, then
    the repr of the input, the quoted form and the unquoted form, and
    finally the elapsed time in seconds.
    """
    sample = ''.join([chr(code) for code in range(256)]) * 4
    started = time.time()
    quoted = quote(sample)
    restored = unquote(quoted)
    finished = time.time()
    if restored != sample:
        print('Wrong!')
    for text in (sample, quoted, restored):
        print(repr(text))
    print(round(finished - started, 3), 'sec')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001408
1409
def reporthook(blocknum, blocksize, totalsize):
    """Print one progress line; passed to urlretrieve() by test()."""
    progress = (blocknum, blocksize, totalsize)
    print("Block number: %d, Block size: %d, Total size: %d" % progress)
Guido van Rossum9ab96d41998-09-28 14:07:00 +00001414
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001415# Test program
def test(args=[]):
    """Retrieve each URL in args and print its headers and contents.

    With no args, a built-in list of file:, ftp: and http: URLs is used
    (plus an https: URL when URLopener has an open_https method).  Each
    URL is fetched with urlretrieve() using reporthook for progress, and
    urlcleanup() is always called at the end.
    """
    import string
    if not args:
        # Rebinding (not mutating) args keeps the shared default list intact.
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.gnu.org/pub/README',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print('-'*10, url, '-'*10)
            fn, h = urlretrieve(url, None, reporthook)
            print(fn)
            if h:
                print('======')
                for k in h.keys(): print(k + ':', h[k])
                print('======')
            # Close the retrieved file deterministically instead of relying
            # on garbage collection of the file object.
            with open(fn, 'rb') as fp:
                data = fp.read()
            if '\r' in data:
                # Strip carriage returns before printing.
                table = string.maketrans("", "")
                data = data.translate(table, "\r")
            print(data)
            fn, h = None, None
        print('-'*40)
    finally:
        urlcleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001448
def main():
    """Command-line entry point: parse sys.argv and run tests or fetch URLs.

    -h prints usage and returns.  -t runs test() on the remaining
    arguments; given more than once it runs test1() first.  Without -t,
    the contents of each URL argument are printed.
    """
    import getopt
    import sys
    try:
        options, urls = getopt.getopt(sys.argv[1:], "th")
    except getopt.error as problem:
        print(problem)
        print("Use -h for help")
        return
    selftest_level = 0
    for flag, _value in options:
        if flag == '-t':
            selftest_level += 1
        elif flag == '-h':
            print("Usage: python urllib.py [-t] [url ...]")
            print("-t runs self-test;", end=' ')
            print("otherwise, contents of urls are printed")
            return
    if selftest_level:
        if selftest_level > 1:
            test1()
        test(urls)
        return
    if not urls:
        print("Use -h for help")
    for url in urls:
        print(urlopen(url).read(), end=' ')
Guido van Rossum23490151998-06-25 02:39:00 +00001475
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001476# Run test program when run as a script
if '__main__' == __name__:
    # Executed directly rather than imported: hand over to main().
    main()