blob: 494f578ec60f5b14c4908b8dfd69a1838ddea303 [file] [log] [blame]
"""Open an arbitrary URL.

See the following document for more info on URLs:
"Names and Addresses, URIs, URLs, URNs, URCs", at
http://www.w3.org/pub/WWW/Addressing/Overview.html

See also the HTTP spec (from which the error codes are derived):
"HTTP - Hypertext Transfer Protocol", at
http://www.w3.org/pub/WWW/Protocols/

Related standards and specs:
- RFC1808: the "relative URL" spec. (authoritative status)
- RFC1738 - the "URL standard". (authoritative status)
- RFC1630 - the "URI spec". (informational status)

The object returned by URLopener().open(file) will differ per
protocol. All you know is that it has methods read(), readline(),
readlines(), fileno(), close() and info(). The read*(), fileno()
and close() methods work like those of open files.
The info() method returns a mimetools.Message object which can be
used to query various info about the object, if available.
(mimetools.Message objects are queried with the getheader() method.)
"""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000024
Guido van Rossum7c395db1994-07-04 22:14:49 +000025import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000026import socket
Jack Jansendc3e3f61995-12-15 13:22:13 +000027import os
Guido van Rossumf0713d32001-08-09 17:43:35 +000028import time
Guido van Rossum3c8484e1996-11-20 22:02:24 +000029import sys
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000030
# Names exported by "from urllib import *".
__all__ = [
    "urlopen", "URLopener", "FancyURLopener", "urlretrieve", "urlcleanup",
    "quote", "quote_plus", "unquote", "unquote_plus", "urlencode",
    "url2pathname", "pathname2url",
    "splittag", "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
    "splittype", "splithost", "splituser", "splitpasswd", "splitport",
    "splitnport", "splitquery", "splitattr", "splitvalue",
    "splitgophertype", "getproxies",
]

# Interpolated into URLopener.version (the default User-agent header).
__version__ = '1.15'    # XXX This version is not always updated :-(

# open_ftp() prunes the FTP connection cache once it holds more entries
# than this.
MAXFTPCACHE = 10
Guido van Rossum6cb15a01995-06-22 19:00:13 +000042
# Select the url2pathname()/pathname2url() pair for this platform.
# Mac, Windows and RISC OS get theirs from dedicated helper modules;
# every other platform uses plain (un)quoting of the path.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """Convert the path part of a URL to a local pathname (unquote it)."""
        return unquote(pathname)

    def pathname2url(pathname):
        """Convert a local pathname to the path part of a URL (quote it)."""
        return quote(pathname)
Guido van Rossum33add0a1998-12-18 15:25:22 +000055
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000056# This really consists of two pieces:
57# (1) a class which handles opening of all sorts of URLs
58# (plus assorted utilities etc.)
59# (2) a set of functions for parsing URLs
60# XXX Should these be separated out into different modules?
61
62
# Shortcut for basic usage: one lazily created FancyURLopener shared by
# urlopen(), urlretrieve() and urlcleanup().
_urlopener = None
def urlopen(url, data=None, proxies=None):
    """urlopen(url [, data [, proxies]]) -> open file-like object

    When 'proxies' is given, a fresh FancyURLopener using that mapping is
    created for this call only; otherwise the shared module-level opener
    is created on first use and reused afterwards.  'data' is passed to
    the opener's open() only when it is not None.
    """
    global _urlopener
    if proxies is not None:
        opener = FancyURLopener(proxies=proxies)
    else:
        if not _urlopener:
            _urlopener = FancyURLopener()
        opener = _urlopener
    if data is not None:
        return opener.open(url, data)
    return opener.open(url)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve 'url' through the shared opener's retrieve() method.

    The shared FancyURLopener is created on first use.  All arguments are
    passed through unchanged and retrieve()'s result is returned.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    """Run cleanup() on the shared opener, if one has been created."""
    opener = _urlopener
    if opener:
        opener.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000087
88
# Default FTP connection cache; URLopener.__init__ binds it to
# self.ftpcache, so all openers share it unless reassigned.
ftpcache = {}
90class URLopener:
Guido van Rossume7b146f2000-02-04 15:28:42 +000091 """Class to open URLs.
92 This is a class rather than just a subroutine because we may need
93 more than one set of global protocol-specific options.
94 Note -- this is a base class for those who don't want the
95 automatic handling of errors type 302 (relocated) and 401
96 (authorization needed)."""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000097
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000098 __tempfiles = None
Guido van Rossum29e77811996-11-27 19:39:58 +000099
Guido van Rossumba311382000-08-24 16:18:04 +0000100 version = "Python-urllib/%s" % __version__
101
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000102 # Constructor
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000103 def __init__(self, proxies=None, **x509):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000104 if proxies is None:
105 proxies = getproxies()
106 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
107 self.proxies = proxies
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000108 self.key_file = x509.get('key_file')
109 self.cert_file = x509.get('cert_file')
Guido van Rossumba311382000-08-24 16:18:04 +0000110 self.addheaders = [('User-agent', self.version)]
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000111 self.__tempfiles = []
112 self.__unlink = os.unlink # See cleanup()
113 self.tempcache = None
114 # Undocumented feature: if you assign {} to tempcache,
115 # it is used to cache files retrieved with
116 # self.retrieve(). This is not enabled by default
117 # since it does not work for changing documents (and I
118 # haven't got the logic to check expiration headers
119 # yet).
120 self.ftpcache = ftpcache
121 # Undocumented feature: you can use a different
122 # ftp cache by assigning to the .ftpcache member;
123 # in case you want logically independent URL openers
124 # XXX This is not threadsafe. Bah.
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000125
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000126 def __del__(self):
127 self.close()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000128
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000129 def close(self):
130 self.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000131
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000132 def cleanup(self):
133 # This code sometimes runs when the rest of this module
134 # has already been deleted, so it can't use any globals
135 # or import anything.
136 if self.__tempfiles:
137 for file in self.__tempfiles:
138 try:
139 self.__unlink(file)
Martin v. Löwis58682b72001-08-11 15:02:57 +0000140 except OSError:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000141 pass
142 del self.__tempfiles[:]
143 if self.tempcache:
144 self.tempcache.clear()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000145
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000146 def addheader(self, *args):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000147 """Add a header to be used by the HTTP interface only
148 e.g. u.addheader('Accept', 'sound/basic')"""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000149 self.addheaders.append(args)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000150
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000151 # External interface
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000152 def open(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000153 """Use URLopener().open(file) instead of open(file, 'r')."""
Martin v. Löwis1d994332000-12-03 18:30:10 +0000154 fullurl = unwrap(toBytes(fullurl))
Raymond Hettinger54f02222002-06-01 14:18:47 +0000155 if self.tempcache and fullurl in self.tempcache:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000156 filename, headers = self.tempcache[fullurl]
157 fp = open(filename, 'rb')
158 return addinfourl(fp, headers, fullurl)
Martin v. Löwis1d994332000-12-03 18:30:10 +0000159 urltype, url = splittype(fullurl)
160 if not urltype:
161 urltype = 'file'
Raymond Hettinger54f02222002-06-01 14:18:47 +0000162 if urltype in self.proxies:
Martin v. Löwis1d994332000-12-03 18:30:10 +0000163 proxy = self.proxies[urltype]
164 urltype, proxyhost = splittype(proxy)
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000165 host, selector = splithost(proxyhost)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000166 url = (host, fullurl) # Signal special case to open_*()
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000167 else:
168 proxy = None
Martin v. Löwis1d994332000-12-03 18:30:10 +0000169 name = 'open_' + urltype
170 self.type = urltype
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000171 if '-' in name:
172 # replace - with _
Guido van Rossumb2493f82000-12-15 15:01:37 +0000173 name = '_'.join(name.split('-'))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000174 if not hasattr(self, name):
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000175 if proxy:
176 return self.open_unknown_proxy(proxy, fullurl, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000177 else:
178 return self.open_unknown(fullurl, data)
179 try:
180 if data is None:
181 return getattr(self, name)(url)
182 else:
183 return getattr(self, name)(url, data)
184 except socket.error, msg:
185 raise IOError, ('socket error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000186
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000187 def open_unknown(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000188 """Overridable interface to open unknown URL type."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000189 type, url = splittype(fullurl)
190 raise IOError, ('url error', 'unknown url type', type)
Guido van Rossumca445401995-08-29 19:19:12 +0000191
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000192 def open_unknown_proxy(self, proxy, fullurl, data=None):
193 """Overridable interface to open unknown URL type."""
194 type, url = splittype(fullurl)
195 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
196
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000197 # External interface
Sjoerd Mullenderd7b86f02000-08-25 11:23:36 +0000198 def retrieve(self, url, filename=None, reporthook=None, data=None):
Brett Cannon7d618c72003-04-24 02:43:20 +0000199 """retrieve(url) returns (filename, headers) for a local object
Guido van Rossume7b146f2000-02-04 15:28:42 +0000200 or (tempfilename, headers) for a remote object."""
Martin v. Löwis1d994332000-12-03 18:30:10 +0000201 url = unwrap(toBytes(url))
Raymond Hettinger54f02222002-06-01 14:18:47 +0000202 if self.tempcache and url in self.tempcache:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000203 return self.tempcache[url]
204 type, url1 = splittype(url)
Raymond Hettinger10ff7062002-06-02 03:04:52 +0000205 if filename is None and (not type or type == 'file'):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000206 try:
207 fp = self.open_local_file(url1)
208 hdrs = fp.info()
209 del fp
210 return url2pathname(splithost(url1)[1]), hdrs
211 except IOError, msg:
212 pass
Fred Drake316a7932000-08-24 01:01:26 +0000213 fp = self.open(url, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000214 headers = fp.info()
Guido van Rossum3b0a3292002-08-09 16:38:32 +0000215 if filename:
216 tfp = open(filename, 'wb')
217 else:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000218 import tempfile
219 garbage, path = splittype(url)
220 garbage, path = splithost(path or "")
221 path, garbage = splitquery(path or "")
222 path, garbage = splitattr(path or "")
223 suffix = os.path.splitext(path)[1]
Guido van Rossum3b0a3292002-08-09 16:38:32 +0000224 (fd, filename) = tempfile.mkstemp(suffix)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000225 self.__tempfiles.append(filename)
Jeremy Hylton3bd6fde2002-10-11 14:36:24 +0000226 tfp = os.fdopen(fd, 'wb')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000227 result = filename, headers
228 if self.tempcache is not None:
229 self.tempcache[url] = result
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000230 bs = 1024*8
231 size = -1
232 blocknum = 1
233 if reporthook:
Raymond Hettinger54f02222002-06-01 14:18:47 +0000234 if "content-length" in headers:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000235 size = int(headers["Content-Length"])
236 reporthook(0, bs, size)
237 block = fp.read(bs)
238 if reporthook:
239 reporthook(1, bs, size)
240 while block:
241 tfp.write(block)
242 block = fp.read(bs)
243 blocknum = blocknum + 1
244 if reporthook:
245 reporthook(blocknum, bs, size)
246 fp.close()
247 tfp.close()
248 del fp
249 del tfp
250 return result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000251
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000252 # Each method named open_<type> knows how to open that type of URL
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000253
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000254 def open_http(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000255 """Use HTTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000256 import httplib
257 user_passwd = None
Walter Dörwald65230a22002-06-03 15:58:32 +0000258 if isinstance(url, str):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000259 host, selector = splithost(url)
260 if host:
261 user_passwd, host = splituser(host)
262 host = unquote(host)
263 realhost = host
264 else:
265 host, selector = url
266 urltype, rest = splittype(selector)
267 url = rest
268 user_passwd = None
Guido van Rossumb2493f82000-12-15 15:01:37 +0000269 if urltype.lower() != 'http':
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000270 realhost = None
271 else:
272 realhost, rest = splithost(rest)
273 if realhost:
274 user_passwd, realhost = splituser(realhost)
275 if user_passwd:
276 selector = "%s://%s%s" % (urltype, realhost, rest)
Tim Peters55c12d42001-08-09 18:04:14 +0000277 if proxy_bypass(realhost):
278 host = realhost
279
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000280 #print "proxy via http:", host, selector
281 if not host: raise IOError, ('http error', 'no host given')
282 if user_passwd:
283 import base64
Guido van Rossumb2493f82000-12-15 15:01:37 +0000284 auth = base64.encodestring(user_passwd).strip()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000285 else:
286 auth = None
287 h = httplib.HTTP(host)
288 if data is not None:
289 h.putrequest('POST', selector)
290 h.putheader('Content-type', 'application/x-www-form-urlencoded')
291 h.putheader('Content-length', '%d' % len(data))
292 else:
293 h.putrequest('GET', selector)
294 if auth: h.putheader('Authorization', 'Basic %s' % auth)
295 if realhost: h.putheader('Host', realhost)
Guido van Rossum68468eb2003-02-27 20:14:51 +0000296 for args in self.addheaders: h.putheader(*args)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000297 h.endheaders()
298 if data is not None:
Fred Drakeec3dfde2001-07-04 05:18:29 +0000299 h.send(data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000300 errcode, errmsg, headers = h.getreply()
301 fp = h.getfile()
302 if errcode == 200:
303 return addinfourl(fp, headers, "http:" + url)
304 else:
305 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000306 return self.http_error(url, fp, errcode, errmsg, headers)
Guido van Rossum29aab751999-03-09 19:31:21 +0000307 else:
308 return self.http_error(url, fp, errcode, errmsg, headers, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000309
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000310 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000311 """Handle http errors.
312 Derived class can override this, or provide specific handlers
313 named http_error_DDD where DDD is the 3-digit error code."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000314 # First check if there's a specific handler for this error
315 name = 'http_error_%d' % errcode
316 if hasattr(self, name):
317 method = getattr(self, name)
318 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000319 result = method(url, fp, errcode, errmsg, headers)
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000320 else:
321 result = method(url, fp, errcode, errmsg, headers, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000322 if result: return result
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000323 return self.http_error_default(url, fp, errcode, errmsg, headers)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000324
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000325 def http_error_default(self, url, fp, errcode, errmsg, headers):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000326 """Default error handler: close the connection and raise IOError."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000327 void = fp.read()
328 fp.close()
329 raise IOError, ('http error', errcode, errmsg, headers)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000330
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000331 if hasattr(socket, "ssl"):
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000332 def open_https(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000333 """Use HTTPS protocol."""
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000334 import httplib
Fred Drake567ca8e2000-08-21 21:42:42 +0000335 user_passwd = None
Walter Dörwald65230a22002-06-03 15:58:32 +0000336 if isinstance(url, str):
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000337 host, selector = splithost(url)
Fred Drake567ca8e2000-08-21 21:42:42 +0000338 if host:
339 user_passwd, host = splituser(host)
340 host = unquote(host)
341 realhost = host
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000342 else:
343 host, selector = url
344 urltype, rest = splittype(selector)
Fred Drake567ca8e2000-08-21 21:42:42 +0000345 url = rest
346 user_passwd = None
Guido van Rossumb2493f82000-12-15 15:01:37 +0000347 if urltype.lower() != 'https':
Fred Drake567ca8e2000-08-21 21:42:42 +0000348 realhost = None
349 else:
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000350 realhost, rest = splithost(rest)
Fred Drake567ca8e2000-08-21 21:42:42 +0000351 if realhost:
352 user_passwd, realhost = splituser(realhost)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000353 if user_passwd:
354 selector = "%s://%s%s" % (urltype, realhost, rest)
Andrew M. Kuchling7ad47922000-06-10 01:41:48 +0000355 #print "proxy via https:", host, selector
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000356 if not host: raise IOError, ('https error', 'no host given')
357 if user_passwd:
358 import base64
Guido van Rossumb2493f82000-12-15 15:01:37 +0000359 auth = base64.encodestring(user_passwd).strip()
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000360 else:
361 auth = None
362 h = httplib.HTTPS(host, 0,
363 key_file=self.key_file,
364 cert_file=self.cert_file)
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000365 if data is not None:
366 h.putrequest('POST', selector)
367 h.putheader('Content-type',
368 'application/x-www-form-urlencoded')
369 h.putheader('Content-length', '%d' % len(data))
370 else:
371 h.putrequest('GET', selector)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000372 if auth: h.putheader('Authorization: Basic %s' % auth)
Fred Drake567ca8e2000-08-21 21:42:42 +0000373 if realhost: h.putheader('Host', realhost)
Guido van Rossum68468eb2003-02-27 20:14:51 +0000374 for args in self.addheaders: h.putheader(*args)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000375 h.endheaders()
Andrew M. Kuchling43c5af02000-04-24 14:17:06 +0000376 if data is not None:
Fred Drakeec3dfde2001-07-04 05:18:29 +0000377 h.send(data)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000378 errcode, errmsg, headers = h.getreply()
379 fp = h.getfile()
380 if errcode == 200:
Guido van Rossumb931bf32001-12-08 17:09:07 +0000381 return addinfourl(fp, headers, "https:" + url)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000382 else:
Fred Drake567ca8e2000-08-21 21:42:42 +0000383 if data is None:
384 return self.http_error(url, fp, errcode, errmsg, headers)
385 else:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000386 return self.http_error(url, fp, errcode, errmsg, headers,
387 data)
Fred Drake567ca8e2000-08-21 21:42:42 +0000388
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000389 def open_gopher(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000390 """Use Gopher protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000391 import gopherlib
392 host, selector = splithost(url)
393 if not host: raise IOError, ('gopher error', 'no host given')
394 host = unquote(host)
395 type, selector = splitgophertype(selector)
396 selector, query = splitquery(selector)
397 selector = unquote(selector)
398 if query:
399 query = unquote(query)
400 fp = gopherlib.send_query(selector, query, host)
401 else:
402 fp = gopherlib.send_selector(selector, host)
403 return addinfourl(fp, noheaders(), "gopher:" + url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000404
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000405 def open_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000406 """Use local file or FTP depending on form of URL."""
Jack Jansen4ef11032002-09-12 20:14:04 +0000407 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000408 return self.open_ftp(url)
409 else:
410 return self.open_local_file(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000411
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000412 def open_local_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000413 """Use local file."""
Jeremy Hylton6d8c1aa2001-08-27 20:16:53 +0000414 import mimetypes, mimetools, rfc822, StringIO
Guido van Rossumf0713d32001-08-09 17:43:35 +0000415 host, file = splithost(url)
416 localname = url2pathname(file)
Guido van Rossuma2da3052002-04-15 00:25:01 +0000417 try:
418 stats = os.stat(localname)
419 except OSError, e:
420 raise IOError(e.errno, e.strerror, e.filename)
Walter Dörwald92b48b72002-03-22 17:30:38 +0000421 size = stats.st_size
422 modified = rfc822.formatdate(stats.st_mtime)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000423 mtype = mimetypes.guess_type(url)[0]
424 headers = mimetools.Message(StringIO.StringIO(
Guido van Rossumf0713d32001-08-09 17:43:35 +0000425 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
426 (mtype or 'text/plain', size, modified)))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000427 if not host:
Guido van Rossum336a2011999-06-24 15:27:36 +0000428 urlfile = file
429 if file[:1] == '/':
430 urlfile = 'file://' + file
Guido van Rossumf0713d32001-08-09 17:43:35 +0000431 return addinfourl(open(localname, 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000432 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000433 host, port = splitport(host)
434 if not port \
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000435 and socket.gethostbyname(host) in (localhost(), thishost()):
Guido van Rossum336a2011999-06-24 15:27:36 +0000436 urlfile = file
437 if file[:1] == '/':
438 urlfile = 'file://' + file
Guido van Rossumf0713d32001-08-09 17:43:35 +0000439 return addinfourl(open(localname, 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000440 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000441 raise IOError, ('local file error', 'not on local host')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000442
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000443 def open_ftp(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000444 """Use FTP protocol."""
Guido van Rossum88e0b5b2001-08-23 13:38:15 +0000445 import mimetypes, mimetools, StringIO
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000446 host, path = splithost(url)
447 if not host: raise IOError, ('ftp error', 'no host given')
448 host, port = splitport(host)
449 user, host = splituser(host)
450 if user: user, passwd = splitpasswd(user)
451 else: passwd = None
452 host = unquote(host)
453 user = unquote(user or '')
454 passwd = unquote(passwd or '')
455 host = socket.gethostbyname(host)
456 if not port:
457 import ftplib
458 port = ftplib.FTP_PORT
459 else:
460 port = int(port)
461 path, attrs = splitattr(path)
462 path = unquote(path)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000463 dirs = path.split('/')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000464 dirs, file = dirs[:-1], dirs[-1]
465 if dirs and not dirs[0]: dirs = dirs[1:]
Guido van Rossum5e006a31999-08-18 17:40:33 +0000466 if dirs and not dirs[0]: dirs[0] = '/'
Guido van Rossumb2493f82000-12-15 15:01:37 +0000467 key = user, host, port, '/'.join(dirs)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000468 # XXX thread unsafe!
469 if len(self.ftpcache) > MAXFTPCACHE:
470 # Prune the cache, rather arbitrarily
471 for k in self.ftpcache.keys():
472 if k != key:
473 v = self.ftpcache[k]
474 del self.ftpcache[k]
475 v.close()
476 try:
Raymond Hettinger54f02222002-06-01 14:18:47 +0000477 if not key in self.ftpcache:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000478 self.ftpcache[key] = \
479 ftpwrapper(user, passwd, host, port, dirs)
480 if not file: type = 'D'
481 else: type = 'I'
482 for attr in attrs:
483 attr, value = splitvalue(attr)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000484 if attr.lower() == 'type' and \
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000485 value in ('a', 'A', 'i', 'I', 'd', 'D'):
Guido van Rossumb2493f82000-12-15 15:01:37 +0000486 type = value.upper()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000487 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
Guido van Rossum88e0b5b2001-08-23 13:38:15 +0000488 mtype = mimetypes.guess_type("ftp:" + url)[0]
489 headers = ""
490 if mtype:
491 headers += "Content-Type: %s\n" % mtype
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000492 if retrlen is not None and retrlen >= 0:
Guido van Rossum88e0b5b2001-08-23 13:38:15 +0000493 headers += "Content-Length: %d\n" % retrlen
494 headers = mimetools.Message(StringIO.StringIO(headers))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000495 return addinfourl(fp, headers, "ftp:" + url)
496 except ftperrors(), msg:
497 raise IOError, ('ftp error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000498
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000499 def open_data(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000500 """Use "data" URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000501 # ignore POSTed data
502 #
503 # syntax of data URLs:
504 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
505 # mediatype := [ type "/" subtype ] *( ";" parameter )
506 # data := *urlchar
507 # parameter := attribute "=" value
Neal Norwitzaad18492002-03-26 16:25:01 +0000508 import StringIO, mimetools
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000509 try:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000510 [type, data] = url.split(',', 1)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000511 except ValueError:
512 raise IOError, ('data error', 'bad data URL')
513 if not type:
514 type = 'text/plain;charset=US-ASCII'
Guido van Rossumb2493f82000-12-15 15:01:37 +0000515 semi = type.rfind(';')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000516 if semi >= 0 and '=' not in type[semi:]:
517 encoding = type[semi+1:]
518 type = type[:semi]
519 else:
520 encoding = ''
521 msg = []
522 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
523 time.gmtime(time.time())))
524 msg.append('Content-type: %s' % type)
525 if encoding == 'base64':
526 import base64
527 data = base64.decodestring(data)
528 else:
529 data = unquote(data)
530 msg.append('Content-length: %d' % len(data))
531 msg.append('')
532 msg.append(data)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000533 msg = '\n'.join(msg)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000534 f = StringIO.StringIO(msg)
535 headers = mimetools.Message(f, 0)
536 f.fileno = None # needed for addinfourl
537 return addinfourl(f, headers, url)
Guido van Rossum6d4d1c21998-03-12 14:32:55 +0000538
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000539
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000540class FancyURLopener(URLopener):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000541 """Derived class with handlers for errors we can handle (perhaps)."""
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000542
Neal Norwitz60e04cd2002-06-11 13:38:51 +0000543 def __init__(self, *args, **kwargs):
Guido van Rossum68468eb2003-02-27 20:14:51 +0000544 URLopener.__init__(self, *args, **kwargs)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000545 self.auth_cache = {}
Skip Montanaroc3e11d62001-02-15 16:56:36 +0000546 self.tries = 0
547 self.maxtries = 10
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000548
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000549 def http_error_default(self, url, fp, errcode, errmsg, headers):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000550 """Default error handling -- don't raise an exception."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000551 return addinfourl(fp, headers, "http:" + url)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000552
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000553 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000554 """Error 302 -- relocated (temporarily)."""
Skip Montanaroc3e11d62001-02-15 16:56:36 +0000555 self.tries += 1
556 if self.maxtries and self.tries >= self.maxtries:
557 if hasattr(self, "http_error_500"):
558 meth = self.http_error_500
559 else:
560 meth = self.http_error_default
561 self.tries = 0
562 return meth(url, fp, 500,
563 "Internal Server Error: Redirect Recursion", headers)
564 result = self.redirect_internal(url, fp, errcode, errmsg, headers,
565 data)
566 self.tries = 0
567 return result
568
569 def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
Raymond Hettinger54f02222002-06-01 14:18:47 +0000570 if 'location' in headers:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000571 newurl = headers['location']
Raymond Hettinger54f02222002-06-01 14:18:47 +0000572 elif 'uri' in headers:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000573 newurl = headers['uri']
574 else:
575 return
576 void = fp.read()
577 fp.close()
Guido van Rossum3527f591999-03-29 20:23:41 +0000578 # In case the server sent a relative URL, join with original:
Moshe Zadka5d87d472001-04-09 14:54:21 +0000579 newurl = basejoin(self.type + ":" + url, newurl)
Guido van Rossumfa19f7c2003-05-16 01:46:51 +0000580 return self.open(newurl)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000581
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000582 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000583 """Error 301 -- also relocated (permanently)."""
584 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
Guido van Rossume6ad8911996-09-10 17:02:56 +0000585
Raymond Hettinger024aaa12003-04-24 15:32:12 +0000586 def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
587 """Error 303 -- also relocated (essentially identical to 302)."""
588 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
589
Guido van Rossumfa19f7c2003-05-16 01:46:51 +0000590 def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
591 """Error 307 -- relocated, but turn POST into error."""
592 if data is None:
593 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
594 else:
595 return self.http_error_default(url, fp, errcode, errmsg, headers)
596
def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 401 -- authentication required.

    Parses the 'www-authenticate' header for a scheme and realm and,
    for the basic scheme, dispatches to self.retry_<type>_basic_auth
    (looked up by name from self.type).  A missing or unparsable
    header, or a non-basic scheme, is handed to
    URLopener.http_error_default.

    See this URL for a description of the basic authentication scheme:
    http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt
    """
    # NOTE(review): the three http_error_default calls below are not
    # returned; this presumably relies on that method raising -- confirm.
    if 'www-authenticate' not in headers:
        URLopener.http_error_default(self, url, fp,
                                     errcode, errmsg, headers)
    challenge = headers['www-authenticate']
    import re
    parsed = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', challenge)
    if not parsed:
        URLopener.http_error_default(self, url, fp,
                                     errcode, errmsg, headers)
    scheme, realm = parsed.groups()
    if scheme.lower() != 'basic':
        URLopener.http_error_default(self, url, fp,
                                     errcode, errmsg, headers)
    retry = getattr(self, 'retry_' + self.type + '_basic_auth')
    if data is None:
        return retry(url, realm)
    return retry(url, realm, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000619
def retry_http_basic_auth(self, url, realm, data=None):
    """Re-issue an http request with user:password embedded in the host.

    Any 'user@' prefix already on the host is stripped.  Its presence
    (a non-zero offset) is passed to get_user_passwd as the
    clear_cache flag, so a cached credential is discarded in that
    case.  Returns None when no user or password is obtained.
    """
    host, selector = splithost(url)
    userinfo_end = host.find('@') + 1
    host = host[userinfo_end:]
    user, passwd = self.get_user_passwd(host, realm, userinfo_end)
    if not (user or passwd):
        return None
    credentials = quote(user, safe='') + ':' + quote(passwd, safe='')
    newurl = 'http://' + credentials + '@' + host + selector
    if data is None:
        return self.open(newurl)
    return self.open(newurl, data)
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000632
def retry_https_basic_auth(self, url, realm, data=None):
    """Re-issue an https request with user:password embedded in the host.

    Any 'user@' prefix already on the host is stripped.  Its presence
    (a non-zero offset) is passed to get_user_passwd as the
    clear_cache flag.  Returns None when no user or password is
    obtained; otherwise the result of self.open_https().
    """
    host, selector = splithost(url)
    userinfo_end = host.find('@') + 1
    host = host[userinfo_end:]
    user, passwd = self.get_user_passwd(host, realm, userinfo_end)
    if not (user or passwd):
        return None
    credentials = quote(user, safe='') + ':' + quote(passwd, safe='')
    newurl = '//' + credentials + '@' + host + selector
    return self.open_https(newurl, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000642
def get_user_passwd(self, host, realm, clear_cache = 0):
    """Return (user, passwd) for realm at host, using self.auth_cache.

    The cache key is realm + '@' + lower-cased host.  A cached entry
    is returned unless clear_cache is true, in which case it is
    dropped and prompt_user_passwd() is asked again.  A fresh answer
    is cached only when it contains a user or a password.
    """
    key = realm + '@' + host.lower()
    if key in self.auth_cache:
        if not clear_cache:
            return self.auth_cache[key]
        del self.auth_cache[key]
    user, passwd = self.prompt_user_passwd(host, realm)
    if user or passwd:
        self.auth_cache[key] = (user, passwd)
    return user, passwd
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000653
def prompt_user_passwd(self, host, realm):
    """Override this in a GUI environment!

    Interactively ask for a user name (raw_input) and then a password
    (getpass.getpass) for `realm` at `host`, and return them as a
    (user, passwd) tuple.  If the user interrupts either prompt with
    KeyboardInterrupt, a newline is printed and (None, None) is
    returned.
    """
    import getpass
    try:
        user = raw_input("Enter username for %s at %s: " % (realm,
                                                           host))
        passwd = getpass.getpass("Enter password for %s in %s at %s: " %
            (user, realm, host))
        return user, passwd
    except KeyboardInterrupt:
        # Finish the interrupted prompt line before giving up.
        print
        return None, None
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000666
667
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000668# Utility functions
669
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup is performed once; the result is kept in the
    module-level _localhost and reused by later calls.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000677
_thishost = None
def thishost():
    """Return the IP address of the current host.

    The address of socket.gethostname() is resolved once; the result
    is kept in the module-level _thishost and reused by later calls.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000685
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported on first use only; its all_errors value is
    kept in the module-level _ftperrors for later calls.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000694
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000695_noheaders = None
696def noheaders():
Guido van Rossume7b146f2000-02-04 15:28:42 +0000697 """Return an empty mimetools.Message object."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000698 global _noheaders
Raymond Hettinger10ff7062002-06-02 03:04:52 +0000699 if _noheaders is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000700 import mimetools
701 import StringIO
702 _noheaders = mimetools.Message(StringIO.StringIO(), 0)
703 _noheaders.fp.close() # Recycle file descriptor
704 return _noheaders
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000705
706
707# Utility classes
708
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections.

    Holds one ftplib.FTP session together with the parameters needed
    to re-establish it, and a `busy` flag marking a data transfer
    whose final response has not yet been read.
    """

    def __init__(self, user, passwd, host, port, dirs):
        """Remember the connection parameters and connect immediately.

        `dirs` is iterated in init(), issuing one cwd per element after
        login.
        """
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.init()

    def init(self):
        """(Re)connect: open a new FTP session, log in and cwd through dirs."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        """Start retrieving `file`, or a directory listing.

        `type` is the FTP transfer type letter; 'd' or 'D' requests a
        directory listing (in ASCII mode).  For a file, existence is
        probed with NLST and the file is then requested with RETR; a
        RETR refused with a 550 reply falls back to LIST.  Other
        permanent errors are re-raised as IOError('ftp error', reason)
        with the original traceback.

        Returns a tuple (fileobj, length): the data connection wrapped
        so that closing it calls endtransfer(), and the second element
        of ntransfercmd()'s result.
        """
        import ftplib
        # Finish any previous transfer before issuing new commands.
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The TYPE command failed; reconnect and retry once.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Use nlst to see if the file exists at all
            try:
                self.ftp.nlst(file)
            except ftplib.error_perm, reason:
                raise IOError, ('ftp error', reason), sys.exc_info()[2]
            # Restore the transfer mode!
            self.ftp.voidcmd(cmd)
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm, reason:
                # A 550 reply leaves conn unset so that the LIST
                # fallback below is tried; anything else is fatal.
                if str(reason)[:3] != '550':
                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing
            if file: cmd = 'LIST ' + file
            else: cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])
    def endtransfer(self):
        """Read the pending final response of a transfer, if one is busy.

        Errors from the FTP layer are deliberately ignored.
        """
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        """Finish any transfer and close the session, ignoring FTP errors."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000781
class addbase:
    """Base class for addinfo and addclosehook.

    Wraps a file-like object by copying its bound read methods onto
    the wrapper instance.
    """

    def __init__(self, fp):
        self.fp = fp
        self.read = fp.read
        self.readline = fp.readline
        # The remaining parts of the file protocol are optional and are
        # forwarded only when the wrapped object provides them.
        for optional in ("readlines", "fileno", "__iter__", "next"):
            if hasattr(fp, optional):
                setattr(self, optional, getattr(fp, optional))

    def __repr__(self):
        return '<%s at %s whose fp = %s>' % (self.__class__.__name__,
                                             repr(id(self)), repr(self.fp))

    def close(self):
        """Drop the forwarded methods and close the wrapped file, if any."""
        self.read = self.readline = self.readlines = self.fileno = None
        if self.fp:
            self.fp.close()
        self.fp = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000807
class addclosehook(addbase):
    """Class to add a close hook to an open file.

    On close(), the wrapped file is closed first and then
    closehook(*hookargs) is called; the hook is cleared afterwards so
    it does not run again on a repeated close().
    """

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook, self.hookargs = closehook, hookargs

    def close(self):
        addbase.close(self)
        if not self.closehook:
            return
        self.closehook(*self.hookargs)
        self.closehook = None
        self.hookargs = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000822
class addinfo(addbase):
    """File wrapper that additionally exposes headers through info()."""

    def __init__(self, fp, headers):
        self.headers = headers
        addbase.__init__(self, fp)

    def info(self):
        """Return the headers object given at construction."""
        return self.headers
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000832
class addinfourl(addbase):
    """File wrapper that additionally exposes info() and geturl()."""

    def __init__(self, fp, headers, url):
        self.headers, self.url = headers, url
        addbase.__init__(self, fp)

    def info(self):
        """Return the headers object given at construction."""
        return self.headers

    def geturl(self):
        """Return the URL given at construction."""
        return self.url
Guido van Rossume6ad8911996-09-10 17:02:56 +0000846
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000847
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL.

    Resolution proceeds from the most to the least specific part of
    `url`:

    - if `url` has its own type (scheme) it is returned unchanged;
    - if it has a host ('//host...') only the type is taken from `base`;
    - otherwise type and host come from `base`, whose tag and query are
      discarded, and the path of `url` is resolved against the base
      path: an absolute path replaces it, a relative path replaces its
      last component, and a bare '#tag' or '?query' is appended to it.
      Leading '../' segments each strip one directory from the base
      path.
    """
    type, path = splittype(url)
    if type:
        # if url is complete (i.e., it contains a type), return it
        return url
    host, path = splithost(path)
    type, basepath = splittype(base) # inherit type from base
    if host:
        # if url contains host, just inherit type
        if type: return type + '://' + host + path
        else:
            # no type inherited, so url must have started with //
            # just return it
            return url
    host, basepath = splithost(basepath) # inherit host
    # The tag and query split off the base are intentionally unused.
    basepath, basetag = splittag(basepath) # remove extraneous cruft
    basepath, basequery = splitquery(basepath) # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to basepath
            i = len(basepath)
        else:
            # else replace last component
            i = basepath.rfind('/')
        if i < 0:
            # basepath not absolute
            if host:
                # host present, make absolute
                basepath = '/'
            else:
                # else keep non-absolute
                basepath = ''
        else:
            # remove last file component
            basepath = basepath[:i+1]
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            # basepath ends with '/', so look for the slash before it.
            i = basepath[:-1].rfind('/')
            if i > 0:
                basepath = basepath[:i+1]
            elif i == 0:
                # Reached the root; further '../' segments stay in path.
                basepath = '/'
                break
            else:
                basepath = ''

    path = basepath + path
    if host and path and path[0] != '/':
        path = '/' + path
    if type and host: return type + '://' + host + path
    elif type: return type + ':' + path
    elif host: return '//' + host + path # don't know what this means
    else: return path
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000904
905
Guido van Rossum7c395db1994-07-04 22:14:49 +0000906# Utilities to parse URLs (most of these return None for missing parts):
Sjoerd Mullendere0371b81995-11-10 10:36:07 +0000907# unwrap('<URL:type://host/path>') --> 'type://host/path'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000908# splittype('type:opaquestring') --> 'type', 'opaquestring'
909# splithost('//host[:port]/path') --> 'host[:port]', '/path'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000910# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
911# splitpasswd('user:passwd') -> 'user', 'passwd'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000912# splitport('host:port') --> 'host', 'port'
913# splitquery('/path?query') --> '/path', 'query'
914# splittag('/path#tag') --> '/path', 'tag'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000915# splitattr('/path;attr1=value1;attr2=value2;...') ->
916# '/path', ['attr1=value1', 'attr2=value2', ...]
917# splitvalue('attr=value') --> 'attr', 'value'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000918# splitgophertype('/Xselector') --> 'X', 'selector'
919# unquote('abc%20def') -> 'abc def'
920# quote('abc def') -> 'abc%20def'
921
# Fallback for interpreters built without the unicode type: nothing is
# ever a unicode string.
def _is_unicode(x):
    return 0

try:
    unicode
except NameError:
    pass
else:
    # The unicode type exists, so install the real check.
    def _is_unicode(x):
        return isinstance(x, unicode)
Guido van Rossum4b46c0a2002-05-24 17:58:05 +0000930
def toBytes(url):
    """toBytes(u"URL") --> 'URL'.

    Non-unicode input is returned untouched.  Unicode input is encoded
    as ASCII; a UnicodeError naming the URL is raised when that fails.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if not _is_unicode(url):
        return url
    try:
        return url.encode("ASCII")
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
942
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Strips surrounding whitespace, one pair of enclosing angle
    brackets, and a leading 'URL:' marker, in that order.
    """
    url = url.strip()
    if url.startswith('<') and url.endswith('>'):
        url = url[1:-1].strip()
    if url.startswith('URL:'):
        url = url[4:].strip()
    return url
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000950
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned lower-cased; (None, url) is returned when url
    does not start with 'type:'.
    """
    global _typeprog
    if _typeprog is None:
        # Compile lazily so importing the module does not pull in re.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # The match covers the scheme plus its ':'; the rest follows it.
    return found.group(1).lower(), url[found.end():]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000964
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # Compile lazily so importing the module does not pull in re.
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000976
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    Both parts are passed through unquote() and returned as a
    two-element list; (None, host) is returned when there is no '@'.
    """
    global _userprog
    if _userprog is None:
        # Compile lazily so importing the module does not pull in re.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    found = _userprog.match(host)
    if found is None:
        return None, host
    return [unquote(part) for part in found.group(1, 2)]
Guido van Rossum7c395db1994-07-04 22:14:49 +0000988
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; (user, None) is returned when
    there is none.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compile lazily so importing the module does not pull in re.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.groups()
Guido van Rossum7c395db1994-07-04 22:14:49 +00001000
Guido van Rossume7b146f2000-02-04 15:28:42 +00001001# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string and must consist of digits only;
    otherwise (host, None) is returned with host left intact.
    """
    global _portprog
    if _portprog is None:
        # Compile lazily so importing the module does not pull in re.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001013
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Return the given default port if no ':' is found; defaults to -1.
    Return the numerical port if a valid number is found after ':'.
    Return None as the port if ':' is present but not followed by a
    valid number.
    """
    global _nportprog
    if _nportprog is None:
        # Compile lazily so importing the module does not pull in re.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port = found.groups()
    nport = None
    if port:
        try:
            nport = int(port)
        except ValueError:
            pass
    return host, nport
Guido van Rossum53725a21996-06-13 19:12:35 +00001035
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'; (url, None) is returned when
    there is none.
    """
    global _queryprog
    if _queryprog is None:
        # Compile lazily so importing the module does not pull in re.
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    found = _queryprog.match(url)
    if found is None:
        return url, None
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001047
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'; (url, None) is returned when
    there is none.
    """
    global _tagprog
    if _tagprog is None:
        # Compile lazily so importing the module does not pull in re.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001059
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    attrs = url.split(';')
    path = attrs.pop(0)
    return path, attrs
Guido van Rossum7c395db1994-07-04 22:14:49 +00001065
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='; (attr, None) is returned when
    there is none.
    """
    global _valueprog
    if _valueprog is None:
        # Compile lazily so importing the module does not pull in re.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.groups()
Guido van Rossum7c395db1994-07-04 22:14:49 +00001077
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    Returns (None, selector) unless selector is a '/' followed by at
    least one more character.
    """
    if selector.startswith('/') and len(selector) > 1:
        gtype, rest = selector[1], selector[2:]
        return gtype, rest
    return None, selector
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001083
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Each '%' followed by two hexadecimal digits is replaced by the
    character with that code.  A '%' that is not followed by a valid
    two-character hex number is left in the output unchanged.
    """
    chunks = s.split('%')
    # Everything before the first '%' is literal text.
    pieces = [chunks[0]]
    for chunk in chunks[1:]:
        decoded = None
        if chunk[1:2]:
            # At least two characters follow the '%': try them as hex.
            try:
                decoded = chr(int(chunk[:2], 16)) + chunk[2:]
            except ValueError:
                pass
        if decoded is None:
            # Not an escape after all; restore the '%' that split removed.
            decoded = '%' + chunk
        pieces.append(decoded)
    return "".join(pieces)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001102
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'

    Like unquote(), but every '+' is first turned into a space.
    """
    if '+' in s:
        s = s.replace('+', ' ')
    return unquote(s)
Guido van Rossum0564e121996-12-13 14:47:36 +00001109
# Characters that never need quoting, whatever the URL component.
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')

# The safe set that results from quote()'s default safe='/'; quote()
# compares against it to decide whether the fast path applies.
_fast_safe_test = always_safe + '/'
# Lookup table for the fast path, built on first use by _fast_quote().
_fast_safe = None

def _fast_quote(s):
    """Quote s, treating exactly the characters of _fast_safe_test as safe."""
    global _fast_safe
    if _fast_safe is None:
        _fast_safe = dict(zip(_fast_safe_test, _fast_safe_test))
    pieces = []
    for ch in s:
        if ch in _fast_safe:
            pieces.append(ch)
        else:
            pieces.append('%%%02X' % ord(ch))
    return ''.join(pieces)
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Every character of s that is neither in always_safe nor in `safe`
    is replaced by '%' plus its two-digit upper-case hex code.

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    safe = always_safe + safe
    if _fast_safe_test == safe:
        # The default safe set has a precomputed lookup table.
        return _fast_quote(s)
    pieces = []
    for ch in s:
        if ch in safe:
            pieces.append(ch)
        else:
            pieces.append('%%%02X' % ord(ch))
    return ''.join(pieces)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001160
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'

    Each space-separated piece is quoted with quote(piece, safe) and
    the pieces are rejoined with '+'.
    """
    if ' ' not in s:
        return quote(s, safe)
    return '+'.join([quote(word, safe) for word in s.split(' ')])
Guido van Rossum0564e121996-12-13 14:47:36 +00001170
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001171def urlencode(query,doseq=0):
1172 """Encode a sequence of two-element tuples or dictionary into a URL query string.
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001173
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001174 If any values in the query arg are sequences and doseq is true, each
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001175 sequence element is converted to a separate parameter.
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001176
1177 If the query arg is a sequence of two-element tuples, the order of the
1178 parameters in the output will match the order of parameters in the
1179 input.
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001180 """
Tim Peters658cba62001-02-09 20:06:00 +00001181
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001182 if hasattr(query,"items"):
1183 # mapping objects
1184 query = query.items()
1185 else:
1186 # it's a bother at times that strings and string-like objects are
1187 # sequences...
1188 try:
1189 # non-sequence items should not work with len()
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001190 # non-empty strings will fail this
Walter Dörwald65230a22002-06-03 15:58:32 +00001191 if len(query) and not isinstance(query[0], tuple):
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001192 raise TypeError
1193 # zero-length sequences of all types will get here and succeed,
1194 # but that's a minor nit - since the original implementation
1195 # allowed empty dicts that type of behavior probably should be
1196 # preserved for consistency
1197 except TypeError:
1198 ty,va,tb = sys.exc_info()
1199 raise TypeError, "not a valid non-string sequence or mapping object", tb
1200
Guido van Rossume7b146f2000-02-04 15:28:42 +00001201 l = []
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001202 if not doseq:
1203 # preserve old behavior
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001204 for k, v in query:
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001205 k = quote_plus(str(k))
1206 v = quote_plus(str(v))
1207 l.append(k + '=' + v)
1208 else:
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001209 for k, v in query:
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001210 k = quote_plus(str(k))
Walter Dörwald65230a22002-06-03 15:58:32 +00001211 if isinstance(v, str):
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001212 v = quote_plus(v)
1213 l.append(k + '=' + v)
Guido van Rossum4b46c0a2002-05-24 17:58:05 +00001214 elif _is_unicode(v):
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001215 # is there a reasonable way to convert to ASCII?
1216 # encode generates a string, but "replace" or "ignore"
1217 # lose information and "strict" can raise UnicodeError
1218 v = quote_plus(v.encode("ASCII","replace"))
1219 l.append(k + '=' + v)
1220 else:
1221 try:
1222 # is this a sufficient test for sequence-ness?
1223 x = len(v)
1224 except TypeError:
1225 # not a sequence
1226 v = quote_plus(str(v))
1227 l.append(k + '=' + v)
1228 else:
1229 # loop over the sequence
1230 for elt in v:
1231 l.append(k + '=' + quote_plus(str(elt)))
Guido van Rossumb2493f82000-12-15 15:01:37 +00001232 return '&'.join(l)
Guido van Rossum810a3391998-07-22 21:33:23 +00001233
Guido van Rossum442e7201996-03-20 15:33:11 +00001234# Proxy handling
def getproxies_environment():
    """Collect proxy settings from the process environment.

    Every environment variable whose lower-cased name ends in
    '_proxy' and whose value is non-empty contributes one entry: the
    part of the name in front of '_proxy' is used as the scheme and the
    value as the proxy server URL.  This seems to be the standard
    convention.  If you need a different way, you can pass a proxies
    dictionary to the [Fancy]URLopener constructor.

    Returns a dictionary of scheme -> proxy server URL mappings.

    """
    suffix = '_proxy'
    found = {}
    for key, setting in os.environ.items():
        if not setting:
            # Variables that are set but empty do not define a proxy.
            continue
        lowered = key.lower()
        if lowered.endswith(suffix):
            found[lowered[:-len(suffix)]] = setting
    return found
1250
Guido van Rossum4163e701998-08-06 13:39:09 +00001251if os.name == 'mac':
def getproxies():
    """Return a dictionary of scheme -> proxy server URL mappings.

    This is the variant used when os.name == 'mac'.  By convention the
    mac uses Internet Config to store proxies; the HTTP proxy host is
    read from the HTTPProxyHost entry, and only when the UseHTTPProxy
    entry is present and true.

    An empty dictionary is returned when the ic module cannot be
    imported or when ic.IC() raises ic.error.

    """
    try:
        import ic
    except ImportError:
        return {}

    try:
        settings = ic.IC()
    except ic.error:
        return {}

    result = {}

    # HTTP:
    if 'UseHTTPProxy' in settings and settings['UseHTTPProxy']:
        try:
            result['http'] = 'http://%s' % settings['HTTPProxyHost']
        except ic.error:
            # The flag is set but no host could be read: leave 'http' out.
            pass

    # FTP: XXXX To be done.
    # Gopher: XXXX To be done.
    return result
Mark Hammond4f570b92000-07-26 07:04:38 +00001281
def proxy_bypass(x):
    """Return 0: this variant (os.name == 'mac') never bypasses the proxy.

    The argument is accepted only for signature compatibility with the
    other proxy_bypass variants and is ignored.

    """
    return 0
1284
Mark Hammond4f570b92000-07-26 07:04:38 +00001285elif os.name == 'nt':
def getproxies_registry():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Win32 uses the registry to store proxies.  This variant (used when
    os.name == 'nt') reads the ProxyEnable and ProxyServer values from
    the current user's Internet Settings key.

    ProxyServer is either a single server that is used for all
    protocols, or a ';'-separated list of 'protocol=address' entries.
    Entries of such a list that contain no '=' (for instance the empty
    entry produced by a trailing ';') are skipped, so they no longer
    discard the entries that follow them.

    An empty dictionary is returned when the proxy is disabled, when
    _winreg or re cannot be imported, or when the registry cannot be
    read.  Reading is best effort: a value in an unexpected format
    leaves the result with whatever was collected so far.

    """
    proxies = {}
    try:
        import _winreg
        import re
    except ImportError:
        # Std modules, so should be around - but you never know!
        return proxies
    try:
        internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
            r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
        # Nested try/finally so the key handle is released even when
        # reading or parsing fails (combined try/except/finally is not
        # available in the Python versions this module supports).
        try:
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        if '=' not in p:
                            # Empty or malformed entry: ignore it rather
                            # than abandon the rest of the list.
                            continue
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
        finally:
            internetSettings.Close()
    except (WindowsError, ValueError, TypeError):
        # Either registry key not found etc, or the value in an
        # unexpected format.
        # proxies already set up to be empty so nothing to do
        pass
    return proxies
Guido van Rossum442e7201996-03-20 15:33:11 +00001330
def getproxies():
    """Return a dictionary of scheme -> proxy server URL mappings.

    This is the variant used when os.name == 'nt'.  Settings found in
    the environment take precedence; the registry is consulted only
    when the environment yields no proxies at all.

    """
    from_environment = getproxies_environment()
    if from_environment:
        return from_environment
    return getproxies_registry()
Tim Peters55c12d42001-08-09 18:04:14 +00001339
1340 def proxy_bypass(host):
1341 try:
1342 import _winreg
1343 import re
Tim Peters55c12d42001-08-09 18:04:14 +00001344 except ImportError:
1345 # Std modules, so should be around - but you never know!
1346 return 0
1347 try:
1348 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1349 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1350 proxyEnable = _winreg.QueryValueEx(internetSettings,
1351 'ProxyEnable')[0]
1352 proxyOverride = str(_winreg.QueryValueEx(internetSettings,
1353 'ProxyOverride')[0])
1354 # ^^^^ Returned as Unicode but problems if not converted to ASCII
1355 except WindowsError:
1356 return 0
1357 if not proxyEnable or not proxyOverride:
1358 return 0
1359 # try to make a host list from name and IP address.
1360 host = [host]
1361 try:
1362 addr = socket.gethostbyname(host[0])
1363 if addr != host:
1364 host.append(addr)
1365 except socket.error:
1366 pass
1367 # make a check value list from the registry entry: replace the
1368 # '<local>' string by the localhost entry and the corresponding
1369 # canonical entry.
1370 proxyOverride = proxyOverride.split(';')
1371 i = 0
1372 while i < len(proxyOverride):
1373 if proxyOverride[i] == '<local>':
1374 proxyOverride[i:i+1] = ['localhost',
1375 '127.0.0.1',
1376 socket.gethostname(),
1377 socket.gethostbyname(
1378 socket.gethostname())]
1379 i += 1
1380 # print proxyOverride
1381 # now check if we match one of the registry values.
1382 for test in proxyOverride:
Tim Petersab9ba272001-08-09 21:40:30 +00001383 test = test.replace(".", r"\.") # mask dots
1384 test = test.replace("*", r".*") # change glob sequence
1385 test = test.replace("?", r".") # change glob char
Tim Peters55c12d42001-08-09 18:04:14 +00001386 for val in host:
1387 # print "%s <--> %s" %( test, val )
1388 if re.match(test, val, re.I):
1389 return 1
1390 return 0
1391
Mark Hammond4f570b92000-07-26 07:04:38 +00001392else:
1393 # By default use environment variables
1394 getproxies = getproxies_environment
1395
def proxy_bypass(host):
    """Return 0: the default variant never bypasses the proxy.

    This definition is used on platforms other than 'mac' and 'nt'.
    The host argument is ignored.

    """
    return 0
Guido van Rossum442e7201996-03-20 15:33:11 +00001398
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001399# Test and time quote() and unquote()
def test1():
    """Round-trip all 256 character codes through quote() and unquote().

    The sample string holds chr(0) .. chr(255) repeated four times.
    'Wrong!' is printed when unquote(quote(sample)) differs from the
    sample; the reprs of the sample, the quoted form and the round-trip
    result are then printed, followed by the elapsed time in seconds
    rounded to three decimals.

    """
    sample = ''.join([chr(code) for code in range(256)]) * 4
    started = time.time()
    quoted = quote(sample)
    roundtrip = unquote(quoted)
    finished = time.time()
    if roundtrip != sample:
        print('Wrong!')
    print(repr(sample))
    print(repr(quoted))
    print(repr(roundtrip))
    print('%s sec' % round(finished - started, 3))
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001414
1415
def reporthook(blocknum, blocksize, totalsize):
    """Print one progress line; test() passes this hook to urlretrieve().

    The three arguments are formatted as integers and written to
    standard output on a single line.

    """
    template = "Block number: %d, Block size: %d, Total size: %d"
    print(template % (blocknum, blocksize, totalsize))
Guido van Rossum9ab96d41998-09-28 14:07:00 +00001420
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001421# Test program
def test(args=[]):
    """Retrieve each URL in args and print what was fetched.

    When args is empty a built-in list of sample URLs is used instead;
    an https URL is added to that list if URLopener has an open_https
    method.  The args list passed by the caller is never modified.

    For every URL the local file name returned by urlretrieve() is
    printed, then the headers (if any), then the file contents with all
    carriage returns removed.  Progress is reported through
    reporthook().  urlcleanup() is always called at the end, even when a
    retrieval fails.

    """
    if not args:
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.python.org/pub/python/README',
##          'gopher://gopher.micro.umn.edu/1/',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    rule = '-' * 10
    try:
        for url in args:
            print("%s %s %s" % (rule, url, rule))
            fn, h = urlretrieve(url, None, reporthook)
            print(fn)
            if h:
                print('======')
                for k in h.keys():
                    print("%s %s" % (k + ':', h[k]))
                print('======')
            fp = open(fn, 'rb')
            # Close the file explicitly instead of relying on the
            # object being reclaimed when its last reference goes away.
            try:
                data = fp.read()
            finally:
                fp.close()
            if '\r' in data:
                data = data.replace('\r', '')
            print(data)
            # Drop the references to the retrieved file name and headers
            # before moving on to the next URL.
            fn, h = None, None
        print('-' * 40)
    finally:
        urlcleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001454
Guido van Rossum23490151998-06-25 02:39:00 +00001455def main():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001456 import getopt, sys
1457 try:
1458 opts, args = getopt.getopt(sys.argv[1:], "th")
1459 except getopt.error, msg:
1460 print msg
1461 print "Use -h for help"
1462 return
1463 t = 0
1464 for o, a in opts:
1465 if o == '-t':
1466 t = t + 1
1467 if o == '-h':
1468 print "Usage: python urllib.py [-t] [url ...]"
1469 print "-t runs self-test;",
1470 print "otherwise, contents of urls are printed"
1471 return
1472 if t:
1473 if t > 1:
1474 test1()
1475 test(args)
1476 else:
1477 if not args:
1478 print "Use -h for help"
1479 for url in args:
1480 print urlopen(url).read(),
Guido van Rossum23490151998-06-25 02:39:00 +00001481
# Script entry point: run the command-line driver only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()