blob: 381d54ec9d9b3a504138976d66749ff2afd64645 [file] [log] [blame]
"""Open an arbitrary URL.

See the following document for more info on URLs:
"Names and Addresses, URIs, URLs, URNs, URCs", at
http://www.w3.org/pub/WWW/Addressing/Overview.html

See also the HTTP spec (from which the error codes are derived):
"HTTP - Hypertext Transfer Protocol", at
http://www.w3.org/pub/WWW/Protocols/

Related standards and specs:
- RFC1808: the "relative URL" spec. (authoritative status)
- RFC1738 - the "URL standard". (authoritative status)
- RFC1630 - the "URI spec". (informational status)

The object returned by URLopener().open(file) will differ per
protocol.  All you know is that it has methods read(), readline(),
readlines(), fileno(), close() and info().  The read*(), fileno()
and close() methods work like those of open files.
The info() method returns a mimetools.Message object which can be
used to query various info about the object, if available.
(mimetools.Message objects are queried with the getheader() method.)
"""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000024
Guido van Rossum7c395db1994-07-04 22:14:49 +000025import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000026import socket
Jack Jansendc3e3f61995-12-15 13:22:13 +000027import os
Guido van Rossumf0713d32001-08-09 17:43:35 +000028import time
Guido van Rossum3c8484e1996-11-20 22:02:24 +000029import sys
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000030
# Names exported by "from urllib import *".
__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url", "splittag",
           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "splitgophertype", "getproxies"]

# Interpolated into URLopener.version, which becomes the User-agent header.
__version__ = '1.15'    # XXX This version is not always updated :-(

# open_ftp() prunes the shared FTP connection cache once it holds more
# entries than this.
MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
Guido van Rossum6cb15a01995-06-22 19:00:13 +000042
# Helper for non-unix systems
# Pick the url2pathname()/pathname2url() pair matching os.name; the
# platform-specific implementations live in separate modules.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    # Fallback for every other os.name: the conversion is just
    # unquoting / quoting of the path string.
    def url2pathname(pathname):
        """Convert the path part of a URL to a local path by unquoting it."""
        return unquote(pathname)
    def pathname2url(pathname):
        """Convert a local path to the path part of a URL by quoting it."""
        return quote(pathname)
Guido van Rossum33add0a1998-12-18 15:25:22 +000055
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000056# This really consists of two pieces:
57# (1) a class which handles opening of all sorts of URLs
58# (plus assorted utilities etc.)
59# (2) a set of functions for parsing URLs
60# XXX Should these be separated out into different modules?
61
62
# Shortcut for basic usage
# Module-wide default opener, created lazily on first use.
_urlopener = None
def urlopen(url, data=None, proxies=None):
    """urlopen(url [, data [, proxies]]) -> open file-like object

    When proxies is given, a fresh FancyURLopener using that mapping is
    created for this call only.  Otherwise the shared module-level
    opener is used (and created on first use).  A data value other
    than None is passed on to the opener's open() method.
    """
    global _urlopener
    if proxies is not None:
        # One-off opener; deliberately not stored in _urlopener.
        chosen = FancyURLopener(proxies=proxies)
    else:
        if not _urlopener:
            _urlopener = FancyURLopener()
        chosen = _urlopener
    if data is None:
        return chosen.open(url)
    return chosen.open(url, data)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve url via the shared module-level opener.

    Creates the shared FancyURLopener on first use and returns whatever
    its retrieve() method returns for the same arguments.
    """
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    shared = _urlopener
    return shared.retrieve(url, filename, reporthook, data)
def urlcleanup():
    """Run cleanup() on the shared module-level opener, if one exists."""
    if not _urlopener:
        # No shared opener has been created yet.
        return
    _urlopener.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000087
88
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000089ftpcache = {}
90class URLopener:
Guido van Rossume7b146f2000-02-04 15:28:42 +000091 """Class to open URLs.
92 This is a class rather than just a subroutine because we may need
93 more than one set of global protocol-specific options.
94 Note -- this is a base class for those who don't want the
95 automatic handling of errors type 302 (relocated) and 401
96 (authorization needed)."""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000097
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000098 __tempfiles = None
Guido van Rossum29e77811996-11-27 19:39:58 +000099
Guido van Rossumba311382000-08-24 16:18:04 +0000100 version = "Python-urllib/%s" % __version__
101
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000102 # Constructor
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000103 def __init__(self, proxies=None, **x509):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000104 if proxies is None:
105 proxies = getproxies()
106 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
107 self.proxies = proxies
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000108 self.key_file = x509.get('key_file')
109 self.cert_file = x509.get('cert_file')
Guido van Rossumba311382000-08-24 16:18:04 +0000110 self.addheaders = [('User-agent', self.version)]
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000111 self.__tempfiles = []
112 self.__unlink = os.unlink # See cleanup()
113 self.tempcache = None
114 # Undocumented feature: if you assign {} to tempcache,
115 # it is used to cache files retrieved with
116 # self.retrieve(). This is not enabled by default
117 # since it does not work for changing documents (and I
118 # haven't got the logic to check expiration headers
119 # yet).
120 self.ftpcache = ftpcache
121 # Undocumented feature: you can use a different
122 # ftp cache by assigning to the .ftpcache member;
123 # in case you want logically independent URL openers
124 # XXX This is not threadsafe. Bah.
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000125
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000126 def __del__(self):
127 self.close()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000128
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000129 def close(self):
130 self.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000131
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000132 def cleanup(self):
133 # This code sometimes runs when the rest of this module
134 # has already been deleted, so it can't use any globals
135 # or import anything.
136 if self.__tempfiles:
137 for file in self.__tempfiles:
138 try:
139 self.__unlink(file)
Martin v. Löwis58682b72001-08-11 15:02:57 +0000140 except OSError:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000141 pass
142 del self.__tempfiles[:]
143 if self.tempcache:
144 self.tempcache.clear()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000145
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000146 def addheader(self, *args):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000147 """Add a header to be used by the HTTP interface only
148 e.g. u.addheader('Accept', 'sound/basic')"""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000149 self.addheaders.append(args)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000150
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000151 # External interface
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000152 def open(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000153 """Use URLopener().open(file) instead of open(file, 'r')."""
Martin v. Löwis1d994332000-12-03 18:30:10 +0000154 fullurl = unwrap(toBytes(fullurl))
Raymond Hettinger54f02222002-06-01 14:18:47 +0000155 if self.tempcache and fullurl in self.tempcache:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000156 filename, headers = self.tempcache[fullurl]
157 fp = open(filename, 'rb')
158 return addinfourl(fp, headers, fullurl)
Martin v. Löwis1d994332000-12-03 18:30:10 +0000159 urltype, url = splittype(fullurl)
160 if not urltype:
161 urltype = 'file'
Raymond Hettinger54f02222002-06-01 14:18:47 +0000162 if urltype in self.proxies:
Martin v. Löwis1d994332000-12-03 18:30:10 +0000163 proxy = self.proxies[urltype]
164 urltype, proxyhost = splittype(proxy)
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000165 host, selector = splithost(proxyhost)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000166 url = (host, fullurl) # Signal special case to open_*()
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000167 else:
168 proxy = None
Martin v. Löwis1d994332000-12-03 18:30:10 +0000169 name = 'open_' + urltype
170 self.type = urltype
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000171 if '-' in name:
172 # replace - with _
Guido van Rossumb2493f82000-12-15 15:01:37 +0000173 name = '_'.join(name.split('-'))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000174 if not hasattr(self, name):
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000175 if proxy:
176 return self.open_unknown_proxy(proxy, fullurl, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000177 else:
178 return self.open_unknown(fullurl, data)
179 try:
180 if data is None:
181 return getattr(self, name)(url)
182 else:
183 return getattr(self, name)(url, data)
184 except socket.error, msg:
185 raise IOError, ('socket error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000186
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000187 def open_unknown(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000188 """Overridable interface to open unknown URL type."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000189 type, url = splittype(fullurl)
190 raise IOError, ('url error', 'unknown url type', type)
Guido van Rossumca445401995-08-29 19:19:12 +0000191
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000192 def open_unknown_proxy(self, proxy, fullurl, data=None):
193 """Overridable interface to open unknown URL type."""
194 type, url = splittype(fullurl)
195 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
196
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000197 # External interface
Sjoerd Mullenderd7b86f02000-08-25 11:23:36 +0000198 def retrieve(self, url, filename=None, reporthook=None, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000199 """retrieve(url) returns (filename, None) for a local object
200 or (tempfilename, headers) for a remote object."""
Martin v. Löwis1d994332000-12-03 18:30:10 +0000201 url = unwrap(toBytes(url))
Raymond Hettinger54f02222002-06-01 14:18:47 +0000202 if self.tempcache and url in self.tempcache:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000203 return self.tempcache[url]
204 type, url1 = splittype(url)
Raymond Hettinger10ff7062002-06-02 03:04:52 +0000205 if filename is None and (not type or type == 'file'):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000206 try:
207 fp = self.open_local_file(url1)
208 hdrs = fp.info()
209 del fp
210 return url2pathname(splithost(url1)[1]), hdrs
211 except IOError, msg:
212 pass
Fred Drake316a7932000-08-24 01:01:26 +0000213 fp = self.open(url, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000214 headers = fp.info()
215 if not filename:
216 import tempfile
217 garbage, path = splittype(url)
218 garbage, path = splithost(path or "")
219 path, garbage = splitquery(path or "")
220 path, garbage = splitattr(path or "")
221 suffix = os.path.splitext(path)[1]
222 filename = tempfile.mktemp(suffix)
223 self.__tempfiles.append(filename)
224 result = filename, headers
225 if self.tempcache is not None:
226 self.tempcache[url] = result
227 tfp = open(filename, 'wb')
228 bs = 1024*8
229 size = -1
230 blocknum = 1
231 if reporthook:
Raymond Hettinger54f02222002-06-01 14:18:47 +0000232 if "content-length" in headers:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000233 size = int(headers["Content-Length"])
234 reporthook(0, bs, size)
235 block = fp.read(bs)
236 if reporthook:
237 reporthook(1, bs, size)
238 while block:
239 tfp.write(block)
240 block = fp.read(bs)
241 blocknum = blocknum + 1
242 if reporthook:
243 reporthook(blocknum, bs, size)
244 fp.close()
245 tfp.close()
246 del fp
247 del tfp
248 return result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000249
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000250 # Each method named open_<type> knows how to open that type of URL
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000251
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000252 def open_http(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000253 """Use HTTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000254 import httplib
255 user_passwd = None
Walter Dörwald65230a22002-06-03 15:58:32 +0000256 if isinstance(url, str):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000257 host, selector = splithost(url)
258 if host:
259 user_passwd, host = splituser(host)
260 host = unquote(host)
261 realhost = host
262 else:
263 host, selector = url
264 urltype, rest = splittype(selector)
265 url = rest
266 user_passwd = None
Guido van Rossumb2493f82000-12-15 15:01:37 +0000267 if urltype.lower() != 'http':
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000268 realhost = None
269 else:
270 realhost, rest = splithost(rest)
271 if realhost:
272 user_passwd, realhost = splituser(realhost)
273 if user_passwd:
274 selector = "%s://%s%s" % (urltype, realhost, rest)
Tim Peters55c12d42001-08-09 18:04:14 +0000275 if proxy_bypass(realhost):
276 host = realhost
277
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000278 #print "proxy via http:", host, selector
279 if not host: raise IOError, ('http error', 'no host given')
280 if user_passwd:
281 import base64
Guido van Rossumb2493f82000-12-15 15:01:37 +0000282 auth = base64.encodestring(user_passwd).strip()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000283 else:
284 auth = None
285 h = httplib.HTTP(host)
286 if data is not None:
287 h.putrequest('POST', selector)
288 h.putheader('Content-type', 'application/x-www-form-urlencoded')
289 h.putheader('Content-length', '%d' % len(data))
290 else:
291 h.putrequest('GET', selector)
292 if auth: h.putheader('Authorization', 'Basic %s' % auth)
293 if realhost: h.putheader('Host', realhost)
294 for args in self.addheaders: apply(h.putheader, args)
295 h.endheaders()
296 if data is not None:
Fred Drakeec3dfde2001-07-04 05:18:29 +0000297 h.send(data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000298 errcode, errmsg, headers = h.getreply()
299 fp = h.getfile()
300 if errcode == 200:
301 return addinfourl(fp, headers, "http:" + url)
302 else:
303 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000304 return self.http_error(url, fp, errcode, errmsg, headers)
Guido van Rossum29aab751999-03-09 19:31:21 +0000305 else:
306 return self.http_error(url, fp, errcode, errmsg, headers, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000307
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000308 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000309 """Handle http errors.
310 Derived class can override this, or provide specific handlers
311 named http_error_DDD where DDD is the 3-digit error code."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000312 # First check if there's a specific handler for this error
313 name = 'http_error_%d' % errcode
314 if hasattr(self, name):
315 method = getattr(self, name)
316 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000317 result = method(url, fp, errcode, errmsg, headers)
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000318 else:
319 result = method(url, fp, errcode, errmsg, headers, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000320 if result: return result
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000321 return self.http_error_default(url, fp, errcode, errmsg, headers)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000322
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000323 def http_error_default(self, url, fp, errcode, errmsg, headers):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000324 """Default error handler: close the connection and raise IOError."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000325 void = fp.read()
326 fp.close()
327 raise IOError, ('http error', errcode, errmsg, headers)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000328
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000329 if hasattr(socket, "ssl"):
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000330 def open_https(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000331 """Use HTTPS protocol."""
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000332 import httplib
Fred Drake567ca8e2000-08-21 21:42:42 +0000333 user_passwd = None
Walter Dörwald65230a22002-06-03 15:58:32 +0000334 if isinstance(url, str):
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000335 host, selector = splithost(url)
Fred Drake567ca8e2000-08-21 21:42:42 +0000336 if host:
337 user_passwd, host = splituser(host)
338 host = unquote(host)
339 realhost = host
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000340 else:
341 host, selector = url
342 urltype, rest = splittype(selector)
Fred Drake567ca8e2000-08-21 21:42:42 +0000343 url = rest
344 user_passwd = None
Guido van Rossumb2493f82000-12-15 15:01:37 +0000345 if urltype.lower() != 'https':
Fred Drake567ca8e2000-08-21 21:42:42 +0000346 realhost = None
347 else:
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000348 realhost, rest = splithost(rest)
Fred Drake567ca8e2000-08-21 21:42:42 +0000349 if realhost:
350 user_passwd, realhost = splituser(realhost)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000351 if user_passwd:
352 selector = "%s://%s%s" % (urltype, realhost, rest)
Andrew M. Kuchling7ad47922000-06-10 01:41:48 +0000353 #print "proxy via https:", host, selector
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000354 if not host: raise IOError, ('https error', 'no host given')
355 if user_passwd:
356 import base64
Guido van Rossumb2493f82000-12-15 15:01:37 +0000357 auth = base64.encodestring(user_passwd).strip()
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000358 else:
359 auth = None
360 h = httplib.HTTPS(host, 0,
361 key_file=self.key_file,
362 cert_file=self.cert_file)
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000363 if data is not None:
364 h.putrequest('POST', selector)
365 h.putheader('Content-type',
366 'application/x-www-form-urlencoded')
367 h.putheader('Content-length', '%d' % len(data))
368 else:
369 h.putrequest('GET', selector)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000370 if auth: h.putheader('Authorization: Basic %s' % auth)
Fred Drake567ca8e2000-08-21 21:42:42 +0000371 if realhost: h.putheader('Host', realhost)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000372 for args in self.addheaders: apply(h.putheader, args)
373 h.endheaders()
Andrew M. Kuchling43c5af02000-04-24 14:17:06 +0000374 if data is not None:
Fred Drakeec3dfde2001-07-04 05:18:29 +0000375 h.send(data)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000376 errcode, errmsg, headers = h.getreply()
377 fp = h.getfile()
378 if errcode == 200:
Guido van Rossumb931bf32001-12-08 17:09:07 +0000379 return addinfourl(fp, headers, "https:" + url)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000380 else:
Fred Drake567ca8e2000-08-21 21:42:42 +0000381 if data is None:
382 return self.http_error(url, fp, errcode, errmsg, headers)
383 else:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000384 return self.http_error(url, fp, errcode, errmsg, headers,
385 data)
Fred Drake567ca8e2000-08-21 21:42:42 +0000386
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000387 def open_gopher(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000388 """Use Gopher protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000389 import gopherlib
390 host, selector = splithost(url)
391 if not host: raise IOError, ('gopher error', 'no host given')
392 host = unquote(host)
393 type, selector = splitgophertype(selector)
394 selector, query = splitquery(selector)
395 selector = unquote(selector)
396 if query:
397 query = unquote(query)
398 fp = gopherlib.send_query(selector, query, host)
399 else:
400 fp = gopherlib.send_selector(selector, host)
401 return addinfourl(fp, noheaders(), "gopher:" + url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000402
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000403 def open_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000404 """Use local file or FTP depending on form of URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000405 if url[:2] == '//' and url[2:3] != '/':
406 return self.open_ftp(url)
407 else:
408 return self.open_local_file(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000409
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000410 def open_local_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000411 """Use local file."""
Jeremy Hylton6d8c1aa2001-08-27 20:16:53 +0000412 import mimetypes, mimetools, rfc822, StringIO
Guido van Rossumf0713d32001-08-09 17:43:35 +0000413 host, file = splithost(url)
414 localname = url2pathname(file)
Guido van Rossuma2da3052002-04-15 00:25:01 +0000415 try:
416 stats = os.stat(localname)
417 except OSError, e:
418 raise IOError(e.errno, e.strerror, e.filename)
Walter Dörwald92b48b72002-03-22 17:30:38 +0000419 size = stats.st_size
420 modified = rfc822.formatdate(stats.st_mtime)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000421 mtype = mimetypes.guess_type(url)[0]
422 headers = mimetools.Message(StringIO.StringIO(
Guido van Rossumf0713d32001-08-09 17:43:35 +0000423 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
424 (mtype or 'text/plain', size, modified)))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000425 if not host:
Guido van Rossum336a2011999-06-24 15:27:36 +0000426 urlfile = file
427 if file[:1] == '/':
428 urlfile = 'file://' + file
Guido van Rossumf0713d32001-08-09 17:43:35 +0000429 return addinfourl(open(localname, 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000430 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000431 host, port = splitport(host)
432 if not port \
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000433 and socket.gethostbyname(host) in (localhost(), thishost()):
Guido van Rossum336a2011999-06-24 15:27:36 +0000434 urlfile = file
435 if file[:1] == '/':
436 urlfile = 'file://' + file
Guido van Rossumf0713d32001-08-09 17:43:35 +0000437 return addinfourl(open(localname, 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000438 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000439 raise IOError, ('local file error', 'not on local host')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000440
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000441 def open_ftp(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000442 """Use FTP protocol."""
Guido van Rossum88e0b5b2001-08-23 13:38:15 +0000443 import mimetypes, mimetools, StringIO
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000444 host, path = splithost(url)
445 if not host: raise IOError, ('ftp error', 'no host given')
446 host, port = splitport(host)
447 user, host = splituser(host)
448 if user: user, passwd = splitpasswd(user)
449 else: passwd = None
450 host = unquote(host)
451 user = unquote(user or '')
452 passwd = unquote(passwd or '')
453 host = socket.gethostbyname(host)
454 if not port:
455 import ftplib
456 port = ftplib.FTP_PORT
457 else:
458 port = int(port)
459 path, attrs = splitattr(path)
460 path = unquote(path)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000461 dirs = path.split('/')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000462 dirs, file = dirs[:-1], dirs[-1]
463 if dirs and not dirs[0]: dirs = dirs[1:]
Guido van Rossum5e006a31999-08-18 17:40:33 +0000464 if dirs and not dirs[0]: dirs[0] = '/'
Guido van Rossumb2493f82000-12-15 15:01:37 +0000465 key = user, host, port, '/'.join(dirs)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000466 # XXX thread unsafe!
467 if len(self.ftpcache) > MAXFTPCACHE:
468 # Prune the cache, rather arbitrarily
469 for k in self.ftpcache.keys():
470 if k != key:
471 v = self.ftpcache[k]
472 del self.ftpcache[k]
473 v.close()
474 try:
Raymond Hettinger54f02222002-06-01 14:18:47 +0000475 if not key in self.ftpcache:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000476 self.ftpcache[key] = \
477 ftpwrapper(user, passwd, host, port, dirs)
478 if not file: type = 'D'
479 else: type = 'I'
480 for attr in attrs:
481 attr, value = splitvalue(attr)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000482 if attr.lower() == 'type' and \
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000483 value in ('a', 'A', 'i', 'I', 'd', 'D'):
Guido van Rossumb2493f82000-12-15 15:01:37 +0000484 type = value.upper()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000485 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
Guido van Rossum88e0b5b2001-08-23 13:38:15 +0000486 mtype = mimetypes.guess_type("ftp:" + url)[0]
487 headers = ""
488 if mtype:
489 headers += "Content-Type: %s\n" % mtype
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000490 if retrlen is not None and retrlen >= 0:
Guido van Rossum88e0b5b2001-08-23 13:38:15 +0000491 headers += "Content-Length: %d\n" % retrlen
492 headers = mimetools.Message(StringIO.StringIO(headers))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000493 return addinfourl(fp, headers, "ftp:" + url)
494 except ftperrors(), msg:
495 raise IOError, ('ftp error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000496
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000497 def open_data(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000498 """Use "data" URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000499 # ignore POSTed data
500 #
501 # syntax of data URLs:
502 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
503 # mediatype := [ type "/" subtype ] *( ";" parameter )
504 # data := *urlchar
505 # parameter := attribute "=" value
Neal Norwitzaad18492002-03-26 16:25:01 +0000506 import StringIO, mimetools
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000507 try:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000508 [type, data] = url.split(',', 1)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000509 except ValueError:
510 raise IOError, ('data error', 'bad data URL')
511 if not type:
512 type = 'text/plain;charset=US-ASCII'
Guido van Rossumb2493f82000-12-15 15:01:37 +0000513 semi = type.rfind(';')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000514 if semi >= 0 and '=' not in type[semi:]:
515 encoding = type[semi+1:]
516 type = type[:semi]
517 else:
518 encoding = ''
519 msg = []
520 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
521 time.gmtime(time.time())))
522 msg.append('Content-type: %s' % type)
523 if encoding == 'base64':
524 import base64
525 data = base64.decodestring(data)
526 else:
527 data = unquote(data)
528 msg.append('Content-length: %d' % len(data))
529 msg.append('')
530 msg.append(data)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000531 msg = '\n'.join(msg)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000532 f = StringIO.StringIO(msg)
533 headers = mimetools.Message(f, 0)
534 f.fileno = None # needed for addinfourl
535 return addinfourl(f, headers, url)
Guido van Rossum6d4d1c21998-03-12 14:32:55 +0000536
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000537
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000538class FancyURLopener(URLopener):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000539 """Derived class with handlers for errors we can handle (perhaps)."""
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000540
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000541 def __init__(self, *args):
542 apply(URLopener.__init__, (self,) + args)
543 self.auth_cache = {}
Skip Montanaroc3e11d62001-02-15 16:56:36 +0000544 self.tries = 0
545 self.maxtries = 10
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000546
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000547 def http_error_default(self, url, fp, errcode, errmsg, headers):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000548 """Default error handling -- don't raise an exception."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000549 return addinfourl(fp, headers, "http:" + url)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000550
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000551 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000552 """Error 302 -- relocated (temporarily)."""
Skip Montanaroc3e11d62001-02-15 16:56:36 +0000553 self.tries += 1
554 if self.maxtries and self.tries >= self.maxtries:
555 if hasattr(self, "http_error_500"):
556 meth = self.http_error_500
557 else:
558 meth = self.http_error_default
559 self.tries = 0
560 return meth(url, fp, 500,
561 "Internal Server Error: Redirect Recursion", headers)
562 result = self.redirect_internal(url, fp, errcode, errmsg, headers,
563 data)
564 self.tries = 0
565 return result
566
567 def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
Raymond Hettinger54f02222002-06-01 14:18:47 +0000568 if 'location' in headers:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000569 newurl = headers['location']
Raymond Hettinger54f02222002-06-01 14:18:47 +0000570 elif 'uri' in headers:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000571 newurl = headers['uri']
572 else:
573 return
574 void = fp.read()
575 fp.close()
Guido van Rossum3527f591999-03-29 20:23:41 +0000576 # In case the server sent a relative URL, join with original:
Moshe Zadka5d87d472001-04-09 14:54:21 +0000577 newurl = basejoin(self.type + ":" + url, newurl)
Guido van Rossum3c8baed2000-02-01 23:36:55 +0000578 if data is None:
579 return self.open(newurl)
580 else:
581 return self.open(newurl, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000582
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000583 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000584 """Error 301 -- also relocated (permanently)."""
585 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
Guido van Rossume6ad8911996-09-10 17:02:56 +0000586
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000587 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000588 """Error 401 -- authentication required.
589 See this URL for a description of the basic authentication scheme:
590 http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
Raymond Hettinger54f02222002-06-01 14:18:47 +0000591 if not 'www-authenticate' in headers:
Tim Peters85ba6732001-02-28 08:26:44 +0000592 URLopener.http_error_default(self, url, fp,
Fred Drakec680ae82001-10-13 18:37:07 +0000593 errcode, errmsg, headers)
Moshe Zadkae99bd172001-02-27 06:27:04 +0000594 stuff = headers['www-authenticate']
595 import re
596 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
597 if not match:
Tim Peters85ba6732001-02-28 08:26:44 +0000598 URLopener.http_error_default(self, url, fp,
Moshe Zadkae99bd172001-02-27 06:27:04 +0000599 errcode, errmsg, headers)
600 scheme, realm = match.groups()
601 if scheme.lower() != 'basic':
Tim Peters85ba6732001-02-28 08:26:44 +0000602 URLopener.http_error_default(self, url, fp,
Moshe Zadkae99bd172001-02-27 06:27:04 +0000603 errcode, errmsg, headers)
604 name = 'retry_' + self.type + '_basic_auth'
605 if data is None:
606 return getattr(self,name)(url, realm)
607 else:
608 return getattr(self,name)(url, realm, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000609
def retry_http_basic_auth(self, url, realm, data=None):
    """Retry an http request with credentials embedded in the URL.

    url is of the form '//host/selector'.  Credentials for (host, realm)
    come from get_user_passwd().  Returns None when neither a user name
    nor a password is obtained; otherwise returns the result of
    self.open() on 'http://user:passwd@host/selector'.
    """
    host, selector = splithost(url)
    # Offset just past any 'user[:passwd]@' prefix; 0 when there is none.
    skip = host.find('@') + 1
    host = host[skip:]
    # A non-zero offset means the URL already carried credentials; it is
    # passed as the clear_cache flag so a cached entry is discarded.
    user, passwd = self.get_user_passwd(host, realm, skip)
    if not (user or passwd):
        return None
    credentials = quote(user, safe='') + ':' + quote(passwd, safe='')
    newurl = 'http://' + (credentials + '@' + host) + selector
    if data is None:
        return self.open(newurl)
    return self.open(newurl, data)
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000622
def retry_https_basic_auth(self, url, realm, data=None):
    """Retry an https request with credentials embedded in the URL.

    url is of the form '//host/selector'.  Credentials for (host, realm)
    come from get_user_passwd().  Returns None when neither a user name
    nor a password is obtained; otherwise returns the result of
    self.open_https() on '//user:passwd@host/selector' and data.
    """
    host, selector = splithost(url)
    # Offset just past any 'user[:passwd]@' prefix; 0 when there is none.
    skip = host.find('@') + 1
    host = host[skip:]
    # A non-zero offset doubles as the clear_cache flag.
    user, passwd = self.get_user_passwd(host, realm, skip)
    if not (user or passwd):
        return None
    credentials = quote(user, safe='') + ':' + quote(passwd, safe='')
    newurl = '//' + (credentials + '@' + host) + selector
    return self.open_https(newurl, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000632
def get_user_passwd(self, host, realm, clear_cache = 0):
    """Return (user, passwd) for realm at host, using self.auth_cache.

    The cache key is realm + '@' + lower-cased host.  A cached pair is
    returned unless clear_cache is true, in which case the entry is
    removed and prompt_user_passwd() is asked again.  A newly obtained
    pair is cached only when at least one of its members is true.
    """
    key = realm + '@' + host.lower()
    if key in self.auth_cache:
        if not clear_cache:
            return self.auth_cache[key]
        del self.auth_cache[key]
    user, passwd = self.prompt_user_passwd(host, realm)
    if user or passwd:
        self.auth_cache[key] = (user, passwd)
    return user, passwd
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000643
def prompt_user_passwd(self, host, realm):
    """Override this in a GUI environment!

    Ask for a user name with raw_input() and for a password with
    getpass.getpass(), and return them as (user, passwd).  If the user
    interrupts with KeyboardInterrupt, print an empty line and return
    (None, None).
    """
    import getpass
    try:
        user = raw_input("Enter username for %s at %s: " % (realm, host))
        passwd = getpass.getpass(
            "Enter password for %s in %s at %s: " % (user, realm, host))
    except KeyboardInterrupt:
        # Finish the interrupted prompt line before giving up.
        print
        return None, None
    return user, passwd
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000656
657
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000658# Utility functions
659
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The address is looked up once with socket.gethostbyname() and then
    kept in the module-level _localhost for later calls.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000667
_thishost = None
def thishost():
    """Return the IP address of the current host.

    The address of socket.gethostname() is resolved once with
    socket.gethostbyname() and then kept in the module-level _thishost.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000675
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    This is ftplib.all_errors; ftplib is imported on the first call only
    and the value is kept in the module-level _ftperrors.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000684
_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object.

    The message is built once from an empty StringIO and kept in the
    module-level _noheaders; every call returns that same object.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    import mimetools
    import StringIO
    _noheaders = mimetools.Message(StringIO.StringIO(), 0)
    # The message has been parsed; its underlying file is no longer needed.
    _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000695
696
697# Utility classes
698
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000699class ftpwrapper:
Guido van Rossume7b146f2000-02-04 15:28:42 +0000700 """Class used by open_ftp() for cache of open FTP connections."""
701
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000702 def __init__(self, user, passwd, host, port, dirs):
703 self.user = user
704 self.passwd = passwd
705 self.host = host
706 self.port = port
707 self.dirs = dirs
708 self.init()
Guido van Rossume7b146f2000-02-04 15:28:42 +0000709
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000710 def init(self):
711 import ftplib
712 self.busy = 0
713 self.ftp = ftplib.FTP()
714 self.ftp.connect(self.host, self.port)
715 self.ftp.login(self.user, self.passwd)
716 for dir in self.dirs:
717 self.ftp.cwd(dir)
Guido van Rossume7b146f2000-02-04 15:28:42 +0000718
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000719 def retrfile(self, file, type):
720 import ftplib
721 self.endtransfer()
722 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
723 else: cmd = 'TYPE ' + type; isdir = 0
724 try:
725 self.ftp.voidcmd(cmd)
726 except ftplib.all_errors:
727 self.init()
728 self.ftp.voidcmd(cmd)
729 conn = None
730 if file and not isdir:
731 # Use nlst to see if the file exists at all
732 try:
733 self.ftp.nlst(file)
734 except ftplib.error_perm, reason:
735 raise IOError, ('ftp error', reason), sys.exc_info()[2]
736 # Restore the transfer mode!
737 self.ftp.voidcmd(cmd)
738 # Try to retrieve as a file
739 try:
740 cmd = 'RETR ' + file
741 conn = self.ftp.ntransfercmd(cmd)
742 except ftplib.error_perm, reason:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000743 if str(reason)[:3] != '550':
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000744 raise IOError, ('ftp error', reason), sys.exc_info()[2]
745 if not conn:
746 # Set transfer mode to ASCII!
747 self.ftp.voidcmd('TYPE A')
748 # Try a directory listing
749 if file: cmd = 'LIST ' + file
750 else: cmd = 'LIST'
751 conn = self.ftp.ntransfercmd(cmd)
752 self.busy = 1
753 # Pass back both a suitably decorated object and a retrieval length
754 return (addclosehook(conn[0].makefile('rb'),
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000755 self.endtransfer), conn[1])
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000756 def endtransfer(self):
757 if not self.busy:
758 return
759 self.busy = 0
760 try:
761 self.ftp.voidresp()
762 except ftperrors():
763 pass
Guido van Rossume7b146f2000-02-04 15:28:42 +0000764
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000765 def close(self):
766 self.endtransfer()
767 try:
768 self.ftp.close()
769 except ftperrors():
770 pass
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000771
class addbase:
    """Base class for addinfo and addclosehook."""

    def __init__(self, fp):
        """Wrap fp, exposing its read methods directly on the instance."""
        self.fp = fp
        self.read = fp.read
        self.readline = fp.readline
        # These two are optional on the wrapped object.
        for optional in ("readlines", "fileno"):
            if hasattr(fp, optional):
                setattr(self, optional, getattr(fp, optional))

    def __repr__(self):
        """Show the class name, id(self) and the wrapped file object."""
        details = (self.__class__.__name__, repr(id(self)), repr(self.fp))
        return '<%s at %s whose fp = %s>' % details

    def close(self):
        """Drop the forwarded methods and close the wrapped file object."""
        for forwarded in ('read', 'readline', 'readlines', 'fileno'):
            setattr(self, forwarded, None)
        if self.fp:
            self.fp.close()
        self.fp = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000793
class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        """Wrap fp; closehook(*hookargs) is called when close() runs."""
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        """Close the wrapped file, then run the hook if one is still set."""
        addbase.close(self)
        if self.closehook:
            self.closehook(*self.hookargs)
            # Forget the hook so that a second close() does not run it.
            self.closehook = None
            self.hookargs = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000808
class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        """Wrap fp and remember headers for info()."""
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000818
class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        """Wrap fp and remember headers and url for the accessors."""
        addbase.__init__(self, fp)
        self.headers, self.url = headers, url

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers

    def geturl(self):
        """Return the url given to the constructor."""
        return self.url
Guido van Rossume6ad8911996-09-10 17:02:56 +0000832
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000833
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL.

    url is returned unchanged when it already has a type, or when it has
    a host and base supplies no type.  Otherwise the missing type, host
    and leading path are inherited from base; any tag or query on base is
    dropped, and leading '../' components of url climb the base path.
    """
    scheme, path = splittype(url)
    if scheme:
        # url is complete (it contains a type): nothing to inherit
        return url
    host, path = splithost(path)
    scheme, basepath = splittype(base)      # inherit type from base
    if host:
        # url contains a host, so only the type can be inherited
        if not scheme:
            # no type to inherit: url must have started with //
            return url
        return scheme + '://' + host + path
    host, basepath = splithost(basepath)    # inherit host
    basepath, basetag = splittag(basepath)  # remove extraneous cruft
    basepath, basequery = splitquery(basepath)  # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to the whole basepath
            cut = len(basepath)
        else:
            # else replace the last component
            cut = basepath.rfind('/')
        if cut >= 0:
            # remove last file component
            basepath = basepath[:cut+1]
        elif host:
            # basepath not absolute but host present: make absolute
            basepath = '/'
        else:
            # basepath not absolute and no host: keep non-absolute
            basepath = ''
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            cut = basepath[:-1].rfind('/')
            if cut == 0:
                # reached the root; remaining '../' stay in path
                basepath = '/'
                break
            if cut > 0:
                basepath = basepath[:cut+1]
            else:
                basepath = ''

        path = basepath + path
    if host and path and path[0] != '/':
        path = '/' + path
    if scheme:
        if host:
            return scheme + '://' + host + path
        return scheme + ':' + path
    if host:
        return '//' + host + path   # don't know what this means
    return path
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000890
891
Guido van Rossum7c395db1994-07-04 22:14:49 +0000892# Utilities to parse URLs (most of these return None for missing parts):
Sjoerd Mullendere0371b81995-11-10 10:36:07 +0000893# unwrap('<URL:type://host/path>') --> 'type://host/path'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000894# splittype('type:opaquestring') --> 'type', 'opaquestring'
895# splithost('//host[:port]/path') --> 'host[:port]', '/path'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000896# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
897# splitpasswd('user:passwd') -> 'user', 'passwd'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000898# splitport('host:port') --> 'host', 'port'
899# splitquery('/path?query') --> '/path', 'query'
900# splittag('/path#tag') --> '/path', 'tag'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000901# splitattr('/path;attr1=value1;attr2=value2;...') ->
902# '/path', ['attr1=value1', 'attr2=value2', ...]
903# splitvalue('attr=value') --> 'attr', 'value'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000904# splitgophertype('/Xselector') --> 'X', 'selector'
905# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
907
def _is_unicode(x):
    """Fallback for builds without a unicode type: nothing is unicode."""
    return 0

try:
    unicode
except NameError:
    # No unicode type available; keep the fallback defined above.
    pass
else:
    def _is_unicode(x):
        """Return true if x is an instance of the unicode type."""
        return isinstance(x, unicode)
Guido van Rossum4b46c0a2002-05-24 17:58:05 +0000916
def toBytes(url):
    """toBytes(u"URL") --> 'URL'.

    Non-unicode arguments are returned unchanged.  A unicode URL is
    encoded as ASCII; UnicodeError is raised when that is not possible.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if not _is_unicode(url):
        return url
    try:
        return url.encode("ASCII")
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
Martin v. Löwis1d994332000-12-03 18:30:10 +0000927 return url
928
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Surrounding whitespace, one pair of enclosing angle brackets and a
    leading 'URL:' marker are removed, in that order.
    """
    stripped = url.strip()
    if stripped.startswith('<') and stripped.endswith('>'):
        stripped = stripped[1:-1].strip()
    if stripped.startswith('URL:'):
        stripped = stripped[4:].strip()
    return stripped
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000936
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  When url does not start with a
    run of characters other than '/' and ':' followed by ':', the result
    is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # Compile the pattern on first use only.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if not found:
        return None, url
    # found.end() is the position just past the ':' separator.
    return found.group(1).lower(), url[found.end():]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000950
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '?' or '#' after the leading
    '//', so a query or fragment that directly follows the host (as in
    '//host?query') is not taken for part of the host name.  A non-empty
    remainder that does not begin with '/' is given a leading '/'.

    When url does not start with '//', the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        # Compile the pattern on first use only.
        import re
        _hostprog = re.compile('^//([^/?#]*)(.*)$')

    match = _hostprog.match(url)
    if not match:
        return None, url
    host, path = match.group(1, 2)
    if path and path[:1] != '/':
        # '?query' or '#tag' directly after the host: the path is empty,
        # so supply the root path to keep the selector well-formed.
        path = '/' + path
    return host, path
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000962
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    Both parts are passed through unquote() and returned as a list.
    When host contains no '@', the result is (None, host).
    """
    global _userprog
    if _userprog is None:
        # Compile the pattern on first use only.
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    found = _userprog.match(host)
    if not found:
        return None, host
    return [unquote(part) for part in found.group(1, 2)]
Guido van Rossum7c395db1994-07-04 22:14:49 +0000974
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split is made at the first ':'.  When user contains no ':', the
    result is (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        # Compile the pattern on first use only.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if not found:
        return user, None
    return found.group(1, 2)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000986
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string and is recognised only when host
    ends in ':' followed by one or more digits; otherwise the result is
    (host, None).
    """
    global _portprog
    if _portprog is None:
        # Compile the pattern on first use only.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if not found:
        return host, None
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000999
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Return the given default port if no ':' is found; defaults to -1.
    Return the numerical port if a valid number is found after the last
    ':'.  Return None as the port if there is a ':' but no valid number.
    """
    global _nportprog
    if _nportprog is None:
        # Compile the pattern on first use only.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if not found:
        return host, defport
    host, port = found.group(1, 2)
    nport = None
    if port:
        # An empty port is treated like a non-numeric one.
        try:
            nport = int(port)
        except ValueError:
            pass
    return host, nport
Guido van Rossum53725a21996-06-13 19:12:35 +00001021
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split is made at the last '?'.  When url contains no '?', the
    result is (url, None).
    """
    global _queryprog
    if _queryprog is None:
        # Compile the pattern on first use only.
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    found = _queryprog.match(url)
    if not found:
        return url, None
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001033
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split is made at the last '#'.  When url contains no '#', the
    result is (url, None).
    """
    global _tagprog
    if _tagprog is None:
        # Compile the pattern on first use only.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if not found:
        return url, None
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001045
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    pieces = url.split(';')
    path, attrs = pieces[0], pieces[1:]
    return path, attrs
Guido van Rossum7c395db1994-07-04 22:14:49 +00001051
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split is made at the first '='.  When attr contains no '=', the
    result is (attr, None).
    """
    global _valueprog
    if _valueprog is None:
        # Compile the pattern on first use only.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if not found:
        return attr, None
    return found.group(1, 2)
Guido van Rossum7c395db1994-07-04 22:14:49 +00001063
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    The type is the single character after a leading '/'.  When selector
    does not start with '/' or has nothing after it, the result is
    (None, selector).
    """
    has_type = selector[:1] == '/' and selector[1:2]
    if not has_type:
        return None, selector
    return selector[1], selector[2:]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001069
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Every '%' followed by two hexadecimal digits is replaced by the
    character with that code.  A '%' that is not followed by a valid
    two-digit escape is left in place.
    """
    chunks = s.split('%')
    # Everything before the first '%' is literal text.
    pieces = [chunks[0]]
    for chunk in chunks[1:]:
        if not chunk[1:2]:
            # Fewer than two characters follow the '%': not an escape.
            pieces.append('%' + chunk)
            continue
        try:
            pieces.append(chr(int(chunk[:2], 16)) + chunk[2:])
        except ValueError:
            pieces.append('%' + chunk)
    return "".join(pieces)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001088
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'

    Like unquote(), but every '+' is first turned into a space.
    """
    if '+' in s:
        # replace '+' with ' '
        s = s.replace('+', ' ')
    return unquote(s)
Guido van Rossum0564e121996-12-13 14:47:36 +00001095
# Characters that quote() never escapes, whatever its 'safe' argument is.
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')

# The safe set that results from quote()'s default safe='/'.
_fast_safe_test = always_safe + '/'
# Lookup table for _fast_safe_test; filled in on first use.
_fast_safe = None

def _fast_quote(s):
    """Quote s, leaving only the characters of _fast_safe_test as is.

    Every other character is replaced by '%' and its code as two
    upper-case hexadecimal digits.
    """
    global _fast_safe
    if _fast_safe is None:
        _fast_safe = {}
        for safe_char in _fast_safe_test:
            _fast_safe[safe_char] = safe_char
    pieces = []
    for ch in s:
        if ch in _fast_safe:
            pieces.append(ch)
        else:
            pieces.append('%%%02X' % ord(ch))
    return ''.join(pieces)
Jeremy Hylton7ae51bf2000-09-14 16:59:07 +00001115
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    Characters in always_safe or in the safe argument are kept; every
    other character becomes '%' plus its code as two upper-case
    hexadecimal digits.
    """
    safe = always_safe + safe
    if _fast_safe_test == safe:
        # Default safe set: use the dictionary-based implementation.
        return _fast_quote(s)
    pieces = []
    for ch in s:
        if ch in safe:
            pieces.append(ch)
        else:
            pieces.append('%%%02X' % ord(ch))
    return ''.join(pieces)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001146
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'

    Everything else is quoted by quote() with the given safe characters.
    """
    if ' ' not in s:
        return quote(s, safe)
    # Quote the pieces between spaces one by one, then join them with '+'.
    quoted = [quote(piece, safe) for piece in s.split(' ')]
    return '+'.join(quoted)
Guido van Rossum0564e121996-12-13 14:47:36 +00001156
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001157def urlencode(query,doseq=0):
1158 """Encode a sequence of two-element tuples or dictionary into a URL query string.
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001159
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001160 If any values in the query arg are sequences and doseq is true, each
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001161 sequence element is converted to a separate parameter.
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001162
1163 If the query arg is a sequence of two-element tuples, the order of the
1164 parameters in the output will match the order of parameters in the
1165 input.
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001166 """
Tim Peters658cba62001-02-09 20:06:00 +00001167
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001168 if hasattr(query,"items"):
1169 # mapping objects
1170 query = query.items()
1171 else:
1172 # it's a bother at times that strings and string-like objects are
1173 # sequences...
1174 try:
1175 # non-sequence items should not work with len()
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001176 # non-empty strings will fail this
Walter Dörwald65230a22002-06-03 15:58:32 +00001177 if len(query) and not isinstance(query[0], tuple):
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001178 raise TypeError
1179 # zero-length sequences of all types will get here and succeed,
1180 # but that's a minor nit - since the original implementation
1181 # allowed empty dicts that type of behavior probably should be
1182 # preserved for consistency
1183 except TypeError:
1184 ty,va,tb = sys.exc_info()
1185 raise TypeError, "not a valid non-string sequence or mapping object", tb
1186
Guido van Rossume7b146f2000-02-04 15:28:42 +00001187 l = []
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001188 if not doseq:
1189 # preserve old behavior
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001190 for k, v in query:
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001191 k = quote_plus(str(k))
1192 v = quote_plus(str(v))
1193 l.append(k + '=' + v)
1194 else:
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001195 for k, v in query:
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001196 k = quote_plus(str(k))
Walter Dörwald65230a22002-06-03 15:58:32 +00001197 if isinstance(v, str):
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001198 v = quote_plus(v)
1199 l.append(k + '=' + v)
Guido van Rossum4b46c0a2002-05-24 17:58:05 +00001200 elif _is_unicode(v):
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001201 # is there a reasonable way to convert to ASCII?
1202 # encode generates a string, but "replace" or "ignore"
1203 # lose information and "strict" can raise UnicodeError
1204 v = quote_plus(v.encode("ASCII","replace"))
1205 l.append(k + '=' + v)
1206 else:
1207 try:
1208 # is this a sufficient test for sequence-ness?
1209 x = len(v)
1210 except TypeError:
1211 # not a sequence
1212 v = quote_plus(str(v))
1213 l.append(k + '=' + v)
1214 else:
1215 # loop over the sequence
1216 for elt in v:
1217 l.append(k + '=' + quote_plus(str(elt)))
Guido van Rossumb2493f82000-12-15 15:01:37 +00001218 return '&'.join(l)
Guido van Rossum810a3391998-07-22 21:33:23 +00001219
Guido van Rossum442e7201996-03-20 15:33:11 +00001220# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    Variable names are compared in lower case and variables with an
    empty value are skipped.
    """
    found = {}
    for name, value in os.environ.items():
        if not value:
            continue
        lowered = name.lower()
        if lowered.endswith('_proxy'):
            # Strip the 6-character '_proxy' suffix to get the scheme.
            found[lowered[:-6]] = value
    return found
1236
Guido van Rossum4163e701998-08-06 13:39:09 +00001237if os.name == 'mac':
def getproxies():
    """Return a dictionary of scheme -> proxy server URL mappings.

    By convention the mac uses Internet Config to store
    proxies.  An HTTP proxy, for instance, is stored under
    the HttpProxy key.

    An empty dictionary is returned when the ic module cannot be
    imported or ic.IC() raises ic.error.
    """
    try:
        import ic
    except ImportError:
        return {}

    try:
        config = ic.IC()
    except ic.error:
        return {}
    proxies = {}
    # HTTP:
    use_http_proxy = 'UseHTTPProxy' in config and config['UseHTTPProxy']
    if use_http_proxy:
        try:
            proxy_host = config['HTTPProxyHost']
        except ic.error:
            # Proxy enabled but no host configured: report none.
            pass
        else:
            proxies['http'] = 'http://%s' % proxy_host
    # FTP: XXXX To be done.
    # Gopher: XXXX To be done.
    return proxies
Mark Hammond4f570b92000-07-26 07:04:38 +00001267
Tim Peters55c12d42001-08-09 18:04:14 +00001268 def proxy_bypass(x):
1269 return 0
1270
Mark Hammond4f570b92000-07-26 07:04:38 +00001271elif os.name == 'nt':
1272 def getproxies_registry():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001273 """Return a dictionary of scheme -> proxy server URL mappings.
Mark Hammond4f570b92000-07-26 07:04:38 +00001274
1275 Win32 uses the registry to store proxies.
1276
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001277 """
1278 proxies = {}
Mark Hammond4f570b92000-07-26 07:04:38 +00001279 try:
1280 import _winreg
1281 except ImportError:
1282 # Std module, so should be around - but you never know!
1283 return proxies
1284 try:
Fredrik Lundhb49f88b2000-09-24 18:51:25 +00001285 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1286 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Mark Hammond4f570b92000-07-26 07:04:38 +00001287 proxyEnable = _winreg.QueryValueEx(internetSettings,
1288 'ProxyEnable')[0]
1289 if proxyEnable:
1290 # Returned as Unicode but problems if not converted to ASCII
1291 proxyServer = str(_winreg.QueryValueEx(internetSettings,
1292 'ProxyServer')[0])
Fredrik Lundhb49f88b2000-09-24 18:51:25 +00001293 if '=' in proxyServer:
1294 # Per-protocol settings
Mark Hammond4f570b92000-07-26 07:04:38 +00001295 for p in proxyServer.split(';'):
Fredrik Lundhb49f88b2000-09-24 18:51:25 +00001296 protocol, address = p.split('=', 1)
Guido van Rossumb955d6c2002-03-31 23:38:48 +00001297 # See if address has a type:// prefix
Guido van Rossum64e5aa92002-04-02 14:38:16 +00001298 import re
1299 if not re.match('^([^/:]+)://', address):
Guido van Rossumb955d6c2002-03-31 23:38:48 +00001300 address = '%s://%s' % (protocol, address)
1301 proxies[protocol] = address
Fredrik Lundhb49f88b2000-09-24 18:51:25 +00001302 else:
1303 # Use one setting for all protocols
1304 if proxyServer[:5] == 'http:':
1305 proxies['http'] = proxyServer
1306 else:
1307 proxies['http'] = 'http://%s' % proxyServer
1308 proxies['ftp'] = 'ftp://%s' % proxyServer
Mark Hammond4f570b92000-07-26 07:04:38 +00001309 internetSettings.Close()
1310 except (WindowsError, ValueError, TypeError):
1311 # Either registry key not found etc, or the value in an
1312 # unexpected format.
1313 # proxies already set up to be empty so nothing to do
1314 pass
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001315 return proxies
Guido van Rossum442e7201996-03-20 15:33:11 +00001316
Mark Hammond4f570b92000-07-26 07:04:38 +00001317 def getproxies():
1318 """Return a dictionary of scheme -> proxy server URL mappings.
1319
1320 Returns settings gathered from the environment, if specified,
1321 or the registry.
1322
1323 """
1324 return getproxies_environment() or getproxies_registry()
Tim Peters55c12d42001-08-09 18:04:14 +00001325
1326 def proxy_bypass(host):
1327 try:
1328 import _winreg
1329 import re
Tim Peters55c12d42001-08-09 18:04:14 +00001330 except ImportError:
1331 # Std modules, so should be around - but you never know!
1332 return 0
1333 try:
1334 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1335 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1336 proxyEnable = _winreg.QueryValueEx(internetSettings,
1337 'ProxyEnable')[0]
1338 proxyOverride = str(_winreg.QueryValueEx(internetSettings,
1339 'ProxyOverride')[0])
1340 # ^^^^ Returned as Unicode but problems if not converted to ASCII
1341 except WindowsError:
1342 return 0
1343 if not proxyEnable or not proxyOverride:
1344 return 0
1345 # try to make a host list from name and IP address.
1346 host = [host]
1347 try:
1348 addr = socket.gethostbyname(host[0])
1349 if addr != host:
1350 host.append(addr)
1351 except socket.error:
1352 pass
1353 # make a check value list from the registry entry: replace the
1354 # '<local>' string by the localhost entry and the corresponding
1355 # canonical entry.
1356 proxyOverride = proxyOverride.split(';')
1357 i = 0
1358 while i < len(proxyOverride):
1359 if proxyOverride[i] == '<local>':
1360 proxyOverride[i:i+1] = ['localhost',
1361 '127.0.0.1',
1362 socket.gethostname(),
1363 socket.gethostbyname(
1364 socket.gethostname())]
1365 i += 1
1366 # print proxyOverride
1367 # now check if we match one of the registry values.
1368 for test in proxyOverride:
Tim Petersab9ba272001-08-09 21:40:30 +00001369 test = test.replace(".", r"\.") # mask dots
1370 test = test.replace("*", r".*") # change glob sequence
1371 test = test.replace("?", r".") # change glob char
Tim Peters55c12d42001-08-09 18:04:14 +00001372 for val in host:
1373 # print "%s <--> %s" %( test, val )
1374 if re.match(test, val, re.I):
1375 return 1
1376 return 0
1377
Mark Hammond4f570b92000-07-26 07:04:38 +00001378else:
1379 # By default use environment variables
1380 getproxies = getproxies_environment
1381
Tim Peters55c12d42001-08-09 18:04:14 +00001382 def proxy_bypass(host):
1383 return 0
Guido van Rossum442e7201996-03-20 15:33:11 +00001384
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001385# Test and time quote() and unquote()
1386def test1():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001387 s = ''
1388 for i in range(256): s = s + chr(i)
1389 s = s*4
1390 t0 = time.time()
1391 qs = quote(s)
1392 uqs = unquote(qs)
1393 t1 = time.time()
1394 if uqs != s:
1395 print 'Wrong!'
1396 print `s`
1397 print `qs`
1398 print `uqs`
1399 print round(t1 - t0, 3), 'sec'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001400
1401
Guido van Rossum9ab96d41998-09-28 14:07:00 +00001402def reporthook(blocknum, blocksize, totalsize):
1403 # Report during remote transfers
Guido van Rossumb2493f82000-12-15 15:01:37 +00001404 print "Block number: %d, Block size: %d, Total size: %d" % (
1405 blocknum, blocksize, totalsize)
Guido van Rossum9ab96d41998-09-28 14:07:00 +00001406
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001407# Test program
Guido van Rossum23490151998-06-25 02:39:00 +00001408def test(args=[]):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001409 if not args:
1410 args = [
1411 '/etc/passwd',
1412 'file:/etc/passwd',
1413 'file://localhost/etc/passwd',
Andrew M. Kuchling56a42352002-03-18 22:18:46 +00001414 'ftp://ftp.python.org/pub/python/README',
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001415## 'gopher://gopher.micro.umn.edu/1/',
1416 'http://www.python.org/index.html',
1417 ]
Guido van Rossum09c8b6c1999-12-07 21:37:17 +00001418 if hasattr(URLopener, "open_https"):
1419 args.append('https://synergy.as.cmu.edu/~geek/')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001420 try:
1421 for url in args:
1422 print '-'*10, url, '-'*10
1423 fn, h = urlretrieve(url, None, reporthook)
Guido van Rossumb2493f82000-12-15 15:01:37 +00001424 print fn
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001425 if h:
1426 print '======'
1427 for k in h.keys(): print k + ':', h[k]
1428 print '======'
1429 fp = open(fn, 'rb')
1430 data = fp.read()
1431 del fp
1432 if '\r' in data:
1433 table = string.maketrans("", "")
Guido van Rossumb2493f82000-12-15 15:01:37 +00001434 data = data.translate(table, "\r")
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001435 print data
1436 fn, h = None, None
1437 print '-'*40
1438 finally:
1439 urlcleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001440
Guido van Rossum23490151998-06-25 02:39:00 +00001441def main():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001442 import getopt, sys
1443 try:
1444 opts, args = getopt.getopt(sys.argv[1:], "th")
1445 except getopt.error, msg:
1446 print msg
1447 print "Use -h for help"
1448 return
1449 t = 0
1450 for o, a in opts:
1451 if o == '-t':
1452 t = t + 1
1453 if o == '-h':
1454 print "Usage: python urllib.py [-t] [url ...]"
1455 print "-t runs self-test;",
1456 print "otherwise, contents of urls are printed"
1457 return
1458 if t:
1459 if t > 1:
1460 test1()
1461 test(args)
1462 else:
1463 if not args:
1464 print "Use -h for help"
1465 for url in args:
1466 print urlopen(url).read(),
Guido van Rossum23490151998-06-25 02:39:00 +00001467
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001468# Run test program when run as a script
if __name__ == '__main__':
    # Executed as a script rather than imported: hand over to main().
    main()