blob: 0a936c438c6bea83fdb2db8c731b3e7761413946 [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""Open an arbitrary URL.
2
3See the following document for more info on URLs:
4"Names and Addresses, URIs, URLs, URNs, URCs", at
5http://www.w3.org/pub/WWW/Addressing/Overview.html
6
7See also the HTTP spec (from which the error codes are derived):
8"HTTP - Hypertext Transfer Protocol", at
9http://www.w3.org/pub/WWW/Protocols/
10
11Related standards and specs:
12- RFC1808: the "relative URL" spec. (authoritative status)
13- RFC1738 - the "URL standard". (authoritative status)
14- RFC1630 - the "URI spec". (informational status)
15
16The object returned by URLopener().open(file) will differ per
17protocol. All you know is that is has methods read(), readline(),
18readlines(), fileno(), close() and info(). The read*(), fileno()
Fredrik Lundhb49f88b2000-09-24 18:51:25 +000019and close() methods work like those of open files.
Guido van Rossume7b146f2000-02-04 15:28:42 +000020The info() method returns a mimetools.Message object which can be
21used to query various info about the object, if available.
22(mimetools.Message objects are queried with the getheader() method.)
23"""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000024
Guido van Rossum7c395db1994-07-04 22:14:49 +000025import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000026import socket
Jack Jansendc3e3f61995-12-15 13:22:13 +000027import os
Guido van Rossumf0713d32001-08-09 17:43:35 +000028import stat
29import time
Guido van Rossum3c8484e1996-11-20 22:02:24 +000030import sys
Martin v. Löwis1d994332000-12-03 18:30:10 +000031import types
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000032
# Public names of this module.
__all__ = [
    "urlopen", "URLopener", "FancyURLopener", "urlretrieve",
    "urlcleanup",
    "quote", "quote_plus", "unquote", "unquote_plus",
    "urlencode", "url2pathname", "pathname2url",
    "splittag", "localhost", "thishost", "ftperrors", "basejoin",
    "unwrap", "splittype", "splithost", "splituser", "splitpasswd",
    "splitport", "splitnport", "splitquery", "splitattr",
    "splitvalue", "splitgophertype", "getproxies",
]

# XXX This version is not always updated :-(
__version__ = '1.15'

# Trim the ftp cache beyond this size
MAXFTPCACHE = 10
Guido van Rossum6cb15a01995-06-22 19:00:13 +000044
# Helper for non-unix systems: take url2pathname() and pathname2url()
# from the platform-specific module when os.name names one, otherwise
# fall back to plain unquoting/quoting.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """Return the result of unquote() applied to pathname."""
        local_name = unquote(pathname)
        return local_name

    def pathname2url(pathname):
        """Return the result of quote() applied to pathname."""
        url_path = quote(pathname)
        return url_path
Guido van Rossum33add0a1998-12-18 15:25:22 +000057
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000058# This really consists of two pieces:
59# (1) a class which handles opening of all sorts of URLs
60# (plus assorted utilities etc.)
61# (2) a set of functions for parsing URLs
62# XXX Should these be separated out into different modules?
63
64
# Shortcut for basic usage: one module-wide opener, created on first use
# and shared by urlopen(), urlretrieve() and urlcleanup().
_urlopener = None

def urlopen(url, data=None):
    """urlopen(url [, data]) -> open file-like object

    Delegates to the open() method of the shared FancyURLopener.  The
    data argument is only passed along when it is not None.
    """
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    opener = _urlopener
    if data is not None:
        return opener.open(url, data)
    return opener.open(url)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Call retrieve() on the shared FancyURLopener and return its result.

    The shared opener is created first if it does not exist yet; all
    four arguments are passed through unchanged.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    """Call cleanup() on the shared opener; do nothing if none exists."""
    opener = _urlopener
    if not opener:
        return
    opener.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000084
85
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000086ftpcache = {}
87class URLopener:
Guido van Rossume7b146f2000-02-04 15:28:42 +000088 """Class to open URLs.
89 This is a class rather than just a subroutine because we may need
90 more than one set of global protocol-specific options.
91 Note -- this is a base class for those who don't want the
92 automatic handling of errors type 302 (relocated) and 401
93 (authorization needed)."""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000094
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000095 __tempfiles = None
Guido van Rossum29e77811996-11-27 19:39:58 +000096
Guido van Rossumba311382000-08-24 16:18:04 +000097 version = "Python-urllib/%s" % __version__
98
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000099 # Constructor
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000100 def __init__(self, proxies=None, **x509):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000101 if proxies is None:
102 proxies = getproxies()
103 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
104 self.proxies = proxies
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000105 self.key_file = x509.get('key_file')
106 self.cert_file = x509.get('cert_file')
Guido van Rossumba311382000-08-24 16:18:04 +0000107 self.addheaders = [('User-agent', self.version)]
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000108 self.__tempfiles = []
109 self.__unlink = os.unlink # See cleanup()
110 self.tempcache = None
111 # Undocumented feature: if you assign {} to tempcache,
112 # it is used to cache files retrieved with
113 # self.retrieve(). This is not enabled by default
114 # since it does not work for changing documents (and I
115 # haven't got the logic to check expiration headers
116 # yet).
117 self.ftpcache = ftpcache
118 # Undocumented feature: you can use a different
119 # ftp cache by assigning to the .ftpcache member;
120 # in case you want logically independent URL openers
121 # XXX This is not threadsafe. Bah.
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000122
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000123 def __del__(self):
124 self.close()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000125
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000126 def close(self):
127 self.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000128
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000129 def cleanup(self):
130 # This code sometimes runs when the rest of this module
131 # has already been deleted, so it can't use any globals
132 # or import anything.
133 if self.__tempfiles:
134 for file in self.__tempfiles:
135 try:
136 self.__unlink(file)
Martin v. Löwis58682b72001-08-11 15:02:57 +0000137 except OSError:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000138 pass
139 del self.__tempfiles[:]
140 if self.tempcache:
141 self.tempcache.clear()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000142
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000143 def addheader(self, *args):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000144 """Add a header to be used by the HTTP interface only
145 e.g. u.addheader('Accept', 'sound/basic')"""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000146 self.addheaders.append(args)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000147
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000148 # External interface
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000149 def open(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000150 """Use URLopener().open(file) instead of open(file, 'r')."""
Martin v. Löwis1d994332000-12-03 18:30:10 +0000151 fullurl = unwrap(toBytes(fullurl))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000152 if self.tempcache and self.tempcache.has_key(fullurl):
153 filename, headers = self.tempcache[fullurl]
154 fp = open(filename, 'rb')
155 return addinfourl(fp, headers, fullurl)
Martin v. Löwis1d994332000-12-03 18:30:10 +0000156 urltype, url = splittype(fullurl)
157 if not urltype:
158 urltype = 'file'
159 if self.proxies.has_key(urltype):
160 proxy = self.proxies[urltype]
161 urltype, proxyhost = splittype(proxy)
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000162 host, selector = splithost(proxyhost)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000163 url = (host, fullurl) # Signal special case to open_*()
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000164 else:
165 proxy = None
Martin v. Löwis1d994332000-12-03 18:30:10 +0000166 name = 'open_' + urltype
167 self.type = urltype
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000168 if '-' in name:
169 # replace - with _
Guido van Rossumb2493f82000-12-15 15:01:37 +0000170 name = '_'.join(name.split('-'))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000171 if not hasattr(self, name):
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000172 if proxy:
173 return self.open_unknown_proxy(proxy, fullurl, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000174 else:
175 return self.open_unknown(fullurl, data)
176 try:
177 if data is None:
178 return getattr(self, name)(url)
179 else:
180 return getattr(self, name)(url, data)
181 except socket.error, msg:
182 raise IOError, ('socket error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000183
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000184 def open_unknown(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000185 """Overridable interface to open unknown URL type."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000186 type, url = splittype(fullurl)
187 raise IOError, ('url error', 'unknown url type', type)
Guido van Rossumca445401995-08-29 19:19:12 +0000188
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000189 def open_unknown_proxy(self, proxy, fullurl, data=None):
190 """Overridable interface to open unknown URL type."""
191 type, url = splittype(fullurl)
192 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
193
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000194 # External interface
Sjoerd Mullenderd7b86f02000-08-25 11:23:36 +0000195 def retrieve(self, url, filename=None, reporthook=None, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000196 """retrieve(url) returns (filename, None) for a local object
197 or (tempfilename, headers) for a remote object."""
Martin v. Löwis1d994332000-12-03 18:30:10 +0000198 url = unwrap(toBytes(url))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000199 if self.tempcache and self.tempcache.has_key(url):
200 return self.tempcache[url]
201 type, url1 = splittype(url)
202 if not filename and (not type or type == 'file'):
203 try:
204 fp = self.open_local_file(url1)
205 hdrs = fp.info()
206 del fp
207 return url2pathname(splithost(url1)[1]), hdrs
208 except IOError, msg:
209 pass
Fred Drake316a7932000-08-24 01:01:26 +0000210 fp = self.open(url, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000211 headers = fp.info()
212 if not filename:
213 import tempfile
214 garbage, path = splittype(url)
215 garbage, path = splithost(path or "")
216 path, garbage = splitquery(path or "")
217 path, garbage = splitattr(path or "")
218 suffix = os.path.splitext(path)[1]
219 filename = tempfile.mktemp(suffix)
220 self.__tempfiles.append(filename)
221 result = filename, headers
222 if self.tempcache is not None:
223 self.tempcache[url] = result
224 tfp = open(filename, 'wb')
225 bs = 1024*8
226 size = -1
227 blocknum = 1
228 if reporthook:
229 if headers.has_key("content-length"):
230 size = int(headers["Content-Length"])
231 reporthook(0, bs, size)
232 block = fp.read(bs)
233 if reporthook:
234 reporthook(1, bs, size)
235 while block:
236 tfp.write(block)
237 block = fp.read(bs)
238 blocknum = blocknum + 1
239 if reporthook:
240 reporthook(blocknum, bs, size)
241 fp.close()
242 tfp.close()
243 del fp
244 del tfp
245 return result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000246
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000247 # Each method named open_<type> knows how to open that type of URL
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000248
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000249 def open_http(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000250 """Use HTTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000251 import httplib
252 user_passwd = None
Martin v. Löwis1d994332000-12-03 18:30:10 +0000253 if type(url) is types.StringType:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000254 host, selector = splithost(url)
255 if host:
256 user_passwd, host = splituser(host)
257 host = unquote(host)
258 realhost = host
259 else:
260 host, selector = url
261 urltype, rest = splittype(selector)
262 url = rest
263 user_passwd = None
Guido van Rossumb2493f82000-12-15 15:01:37 +0000264 if urltype.lower() != 'http':
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000265 realhost = None
266 else:
267 realhost, rest = splithost(rest)
268 if realhost:
269 user_passwd, realhost = splituser(realhost)
270 if user_passwd:
271 selector = "%s://%s%s" % (urltype, realhost, rest)
Tim Peters55c12d42001-08-09 18:04:14 +0000272 if proxy_bypass(realhost):
273 host = realhost
274
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000275 #print "proxy via http:", host, selector
276 if not host: raise IOError, ('http error', 'no host given')
277 if user_passwd:
278 import base64
Guido van Rossumb2493f82000-12-15 15:01:37 +0000279 auth = base64.encodestring(user_passwd).strip()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000280 else:
281 auth = None
282 h = httplib.HTTP(host)
283 if data is not None:
284 h.putrequest('POST', selector)
285 h.putheader('Content-type', 'application/x-www-form-urlencoded')
286 h.putheader('Content-length', '%d' % len(data))
287 else:
288 h.putrequest('GET', selector)
289 if auth: h.putheader('Authorization', 'Basic %s' % auth)
290 if realhost: h.putheader('Host', realhost)
291 for args in self.addheaders: apply(h.putheader, args)
292 h.endheaders()
293 if data is not None:
Fred Drakeec3dfde2001-07-04 05:18:29 +0000294 h.send(data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000295 errcode, errmsg, headers = h.getreply()
296 fp = h.getfile()
297 if errcode == 200:
298 return addinfourl(fp, headers, "http:" + url)
299 else:
300 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000301 return self.http_error(url, fp, errcode, errmsg, headers)
Guido van Rossum29aab751999-03-09 19:31:21 +0000302 else:
303 return self.http_error(url, fp, errcode, errmsg, headers, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000304
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000305 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000306 """Handle http errors.
307 Derived class can override this, or provide specific handlers
308 named http_error_DDD where DDD is the 3-digit error code."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000309 # First check if there's a specific handler for this error
310 name = 'http_error_%d' % errcode
311 if hasattr(self, name):
312 method = getattr(self, name)
313 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000314 result = method(url, fp, errcode, errmsg, headers)
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000315 else:
316 result = method(url, fp, errcode, errmsg, headers, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000317 if result: return result
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000318 return self.http_error_default(url, fp, errcode, errmsg, headers)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000319
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000320 def http_error_default(self, url, fp, errcode, errmsg, headers):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000321 """Default error handler: close the connection and raise IOError."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000322 void = fp.read()
323 fp.close()
324 raise IOError, ('http error', errcode, errmsg, headers)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000325
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000326 if hasattr(socket, "ssl"):
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000327 def open_https(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000328 """Use HTTPS protocol."""
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000329 import httplib
Fred Drake567ca8e2000-08-21 21:42:42 +0000330 user_passwd = None
Moshe Zadkab2a0a832001-01-08 07:09:25 +0000331 if type(url) is types.StringType:
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000332 host, selector = splithost(url)
Fred Drake567ca8e2000-08-21 21:42:42 +0000333 if host:
334 user_passwd, host = splituser(host)
335 host = unquote(host)
336 realhost = host
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000337 else:
338 host, selector = url
339 urltype, rest = splittype(selector)
Fred Drake567ca8e2000-08-21 21:42:42 +0000340 url = rest
341 user_passwd = None
Guido van Rossumb2493f82000-12-15 15:01:37 +0000342 if urltype.lower() != 'https':
Fred Drake567ca8e2000-08-21 21:42:42 +0000343 realhost = None
344 else:
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000345 realhost, rest = splithost(rest)
Fred Drake567ca8e2000-08-21 21:42:42 +0000346 if realhost:
347 user_passwd, realhost = splituser(realhost)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000348 if user_passwd:
349 selector = "%s://%s%s" % (urltype, realhost, rest)
Andrew M. Kuchling7ad47922000-06-10 01:41:48 +0000350 #print "proxy via https:", host, selector
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000351 if not host: raise IOError, ('https error', 'no host given')
352 if user_passwd:
353 import base64
Guido van Rossumb2493f82000-12-15 15:01:37 +0000354 auth = base64.encodestring(user_passwd).strip()
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000355 else:
356 auth = None
357 h = httplib.HTTPS(host, 0,
358 key_file=self.key_file,
359 cert_file=self.cert_file)
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000360 if data is not None:
361 h.putrequest('POST', selector)
362 h.putheader('Content-type',
363 'application/x-www-form-urlencoded')
364 h.putheader('Content-length', '%d' % len(data))
365 else:
366 h.putrequest('GET', selector)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000367 if auth: h.putheader('Authorization: Basic %s' % auth)
Fred Drake567ca8e2000-08-21 21:42:42 +0000368 if realhost: h.putheader('Host', realhost)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000369 for args in self.addheaders: apply(h.putheader, args)
370 h.endheaders()
Andrew M. Kuchling43c5af02000-04-24 14:17:06 +0000371 if data is not None:
Fred Drakeec3dfde2001-07-04 05:18:29 +0000372 h.send(data)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000373 errcode, errmsg, headers = h.getreply()
374 fp = h.getfile()
375 if errcode == 200:
Guido van Rossumb931bf32001-12-08 17:09:07 +0000376 return addinfourl(fp, headers, "https:" + url)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000377 else:
Fred Drake567ca8e2000-08-21 21:42:42 +0000378 if data is None:
379 return self.http_error(url, fp, errcode, errmsg, headers)
380 else:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000381 return self.http_error(url, fp, errcode, errmsg, headers,
382 data)
Fred Drake567ca8e2000-08-21 21:42:42 +0000383
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000384 def open_gopher(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000385 """Use Gopher protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000386 import gopherlib
387 host, selector = splithost(url)
388 if not host: raise IOError, ('gopher error', 'no host given')
389 host = unquote(host)
390 type, selector = splitgophertype(selector)
391 selector, query = splitquery(selector)
392 selector = unquote(selector)
393 if query:
394 query = unquote(query)
395 fp = gopherlib.send_query(selector, query, host)
396 else:
397 fp = gopherlib.send_selector(selector, host)
398 return addinfourl(fp, noheaders(), "gopher:" + url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000399
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000400 def open_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000401 """Use local file or FTP depending on form of URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000402 if url[:2] == '//' and url[2:3] != '/':
403 return self.open_ftp(url)
404 else:
405 return self.open_local_file(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000406
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000407 def open_local_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000408 """Use local file."""
Jeremy Hylton6d8c1aa2001-08-27 20:16:53 +0000409 import mimetypes, mimetools, rfc822, StringIO
Guido van Rossumf0713d32001-08-09 17:43:35 +0000410 host, file = splithost(url)
411 localname = url2pathname(file)
412 stats = os.stat(localname)
413 size = stats[stat.ST_SIZE]
Jeremy Hylton6d8c1aa2001-08-27 20:16:53 +0000414 modified = rfc822.formatdate(stats[stat.ST_MTIME])
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000415 mtype = mimetypes.guess_type(url)[0]
416 headers = mimetools.Message(StringIO.StringIO(
Guido van Rossumf0713d32001-08-09 17:43:35 +0000417 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
418 (mtype or 'text/plain', size, modified)))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000419 if not host:
Guido van Rossum336a2011999-06-24 15:27:36 +0000420 urlfile = file
421 if file[:1] == '/':
422 urlfile = 'file://' + file
Guido van Rossumf0713d32001-08-09 17:43:35 +0000423 return addinfourl(open(localname, 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000424 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000425 host, port = splitport(host)
426 if not port \
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000427 and socket.gethostbyname(host) in (localhost(), thishost()):
Guido van Rossum336a2011999-06-24 15:27:36 +0000428 urlfile = file
429 if file[:1] == '/':
430 urlfile = 'file://' + file
Guido van Rossumf0713d32001-08-09 17:43:35 +0000431 return addinfourl(open(localname, 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000432 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000433 raise IOError, ('local file error', 'not on local host')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000434
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000435 def open_ftp(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000436 """Use FTP protocol."""
Guido van Rossum88e0b5b2001-08-23 13:38:15 +0000437 import mimetypes, mimetools, StringIO
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000438 host, path = splithost(url)
439 if not host: raise IOError, ('ftp error', 'no host given')
440 host, port = splitport(host)
441 user, host = splituser(host)
442 if user: user, passwd = splitpasswd(user)
443 else: passwd = None
444 host = unquote(host)
445 user = unquote(user or '')
446 passwd = unquote(passwd or '')
447 host = socket.gethostbyname(host)
448 if not port:
449 import ftplib
450 port = ftplib.FTP_PORT
451 else:
452 port = int(port)
453 path, attrs = splitattr(path)
454 path = unquote(path)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000455 dirs = path.split('/')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000456 dirs, file = dirs[:-1], dirs[-1]
457 if dirs and not dirs[0]: dirs = dirs[1:]
Guido van Rossum5e006a31999-08-18 17:40:33 +0000458 if dirs and not dirs[0]: dirs[0] = '/'
Guido van Rossumb2493f82000-12-15 15:01:37 +0000459 key = user, host, port, '/'.join(dirs)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000460 # XXX thread unsafe!
461 if len(self.ftpcache) > MAXFTPCACHE:
462 # Prune the cache, rather arbitrarily
463 for k in self.ftpcache.keys():
464 if k != key:
465 v = self.ftpcache[k]
466 del self.ftpcache[k]
467 v.close()
468 try:
469 if not self.ftpcache.has_key(key):
470 self.ftpcache[key] = \
471 ftpwrapper(user, passwd, host, port, dirs)
472 if not file: type = 'D'
473 else: type = 'I'
474 for attr in attrs:
475 attr, value = splitvalue(attr)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000476 if attr.lower() == 'type' and \
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000477 value in ('a', 'A', 'i', 'I', 'd', 'D'):
Guido van Rossumb2493f82000-12-15 15:01:37 +0000478 type = value.upper()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000479 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
Guido van Rossum88e0b5b2001-08-23 13:38:15 +0000480 mtype = mimetypes.guess_type("ftp:" + url)[0]
481 headers = ""
482 if mtype:
483 headers += "Content-Type: %s\n" % mtype
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000484 if retrlen is not None and retrlen >= 0:
Guido van Rossum88e0b5b2001-08-23 13:38:15 +0000485 headers += "Content-Length: %d\n" % retrlen
486 headers = mimetools.Message(StringIO.StringIO(headers))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000487 return addinfourl(fp, headers, "ftp:" + url)
488 except ftperrors(), msg:
489 raise IOError, ('ftp error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000490
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000491 def open_data(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000492 """Use "data" URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000493 # ignore POSTed data
494 #
495 # syntax of data URLs:
496 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
497 # mediatype := [ type "/" subtype ] *( ";" parameter )
498 # data := *urlchar
499 # parameter := attribute "=" value
500 import StringIO, mimetools, time
501 try:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000502 [type, data] = url.split(',', 1)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000503 except ValueError:
504 raise IOError, ('data error', 'bad data URL')
505 if not type:
506 type = 'text/plain;charset=US-ASCII'
Guido van Rossumb2493f82000-12-15 15:01:37 +0000507 semi = type.rfind(';')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000508 if semi >= 0 and '=' not in type[semi:]:
509 encoding = type[semi+1:]
510 type = type[:semi]
511 else:
512 encoding = ''
513 msg = []
514 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
515 time.gmtime(time.time())))
516 msg.append('Content-type: %s' % type)
517 if encoding == 'base64':
518 import base64
519 data = base64.decodestring(data)
520 else:
521 data = unquote(data)
522 msg.append('Content-length: %d' % len(data))
523 msg.append('')
524 msg.append(data)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000525 msg = '\n'.join(msg)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000526 f = StringIO.StringIO(msg)
527 headers = mimetools.Message(f, 0)
528 f.fileno = None # needed for addinfourl
529 return addinfourl(f, headers, url)
Guido van Rossum6d4d1c21998-03-12 14:32:55 +0000530
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000531
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        """Initialize like URLopener, plus auth cache and redirect counter.

        Positional and keyword arguments (e.g. key_file, cert_file) are
        passed on to URLopener.__init__.
        """
        apply(URLopener.__init__, (self,) + args, kwargs)
        # Maps "realm@host" to a (user, passwd) tuple.
        self.auth_cache = {}
        # Redirects followed so far / limit enforced by http_error_302().
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        Follows the redirect unless self.maxtries redirects have been
        chained, in which case the response is handed to
        http_error_500 (if defined) or http_error_default as a 500.
        """
        self.tries += 1
        try:
            if self.maxtries and self.tries >= self.maxtries:
                if hasattr(self, "http_error_500"):
                    meth = self.http_error_500
                else:
                    meth = self.http_error_default
                self.tries = 0
                return meth(url, fp, 500,
                            "Internal Server Error: Redirect Recursion",
                            headers)
            return self.redirect_internal(url, fp, errcode, errmsg,
                                          headers, data)
        finally:
            # Reset even when the redirected open raised, so a failed
            # chain does not count against later, unrelated requests.
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Open the URL named by the 'location' (or 'uri') header.

        Returns None when neither header is present.
        """
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
        # Each URLopener.http_error_default() call below raises IOError,
        # so execution only continues for a well-formed 'basic' challenge.
        if not headers.has_key('www-authenticate'):
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        # Dispatch to retry_http_basic_auth / retry_https_basic_auth.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Reopen an http URL with user:passwd@ inserted before the host.

        Returns None when no user name or password is obtained.
        """
        host, selector = splithost(url)
        # i is non-zero when the URL already carried user info; it is
        # passed as clear_cache so a cached entry is dropped and re-asked.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Reopen an https URL with user:passwd@ inserted before the host.

        Returns None when no user name or password is obtained.
        """
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = '//' + host + selector
        return self.open_https(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return (user, passwd) for realm at host.

        A cached pair is returned unless clear_cache is true, in which
        case the entry is dropped and prompt_user_passwd() is asked
        again.  A non-empty answer is stored in the cache.
        """
        key = realm + '@' + host.lower()
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!

        Asks on the terminal; returns (None, None) on KeyboardInterrupt.
        """
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000650
651
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000652# Utility functions
653
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The address is looked up once and remembered in _localhost.
    """
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000661
_thishost = None
def thishost():
    """Return the IP address of the current host.

    The address is looked up once and remembered in _thishost.
    """
    global _thishost
    if _thishost:
        return _thishost
    hostname = socket.gethostname()
    _thishost = socket.gethostbyname(hostname)
    return _thishost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000669
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported only on first use; the result is remembered in
    _ftperrors.
    """
    global _ftperrors
    if _ftperrors:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000678
_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object.

    The object is created on first use and then shared by all callers.
    """
    global _noheaders
    # Test against None rather than truthiness: a Message with no headers
    # has length zero and therefore tests false, so 'not _noheaders' would
    # rebuild the object on every call instead of reusing the cached one.
    if _noheaders is None:
        import mimetools
        import StringIO
        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
        _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000689
690
691# Utility classes
692
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs):
        """Remember the login parameters and open the connection via init()."""
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.init()

    def init(self):
        """(Re)connect: log in and change into each directory in self.dirs."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        """Start retrieving file, or a directory listing, over the connection.

        type 'd' or 'D' requests a listing (sent in ASCII mode); any other
        value is appended to a 'TYPE ' command.  Returns a pair: a file
        object wrapped so that closing it calls endtransfer(), and the
        second element of the ntransfercmd() result.  FTP permission errors
        are re-raised as IOError, except a '550' reply to RETR, which falls
        back to a directory listing.
        """
        import ftplib
        # Finish any transfer still pending from a previous call.
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The command failed; reconnect from scratch and try once more.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Use nlst to see if the file exists at all
            try:
                self.ftp.nlst(file)
            except ftplib.error_perm, reason:
                raise IOError, ('ftp error', reason), sys.exc_info()[2]
            # Restore the transfer mode!
            self.ftp.voidcmd(cmd)
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm, reason:
                # Only a reply starting with '550' is tolerated; conn stays
                # None so the listing fallback below is used.
                if str(reason)[:3] != '550':
                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing
            if file: cmd = 'LIST ' + file
            else: cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])
    def endtransfer(self):
        """Read the pending server response, if a transfer is in progress.

        FTP errors while reading the response are ignored.
        """
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        """End any transfer and close the connection, ignoring FTP errors."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000765
class addbase:
    """Base class for addinfo and addclosehook.

    Wraps an open file object and re-exports its read methods as
    attributes of the wrapper.
    """

    def __init__(self, fp):
        self.fp = fp
        self.read = fp.read
        self.readline = fp.readline
        # readlines() and fileno() are optional on the wrapped object.
        if hasattr(fp, "readlines"):
            self.readlines = fp.readlines
        if hasattr(fp, "fileno"):
            self.fileno = fp.fileno

    def __repr__(self):
        return '<%s at %s whose fp = %s>' % (
            self.__class__.__name__, repr(id(self)), repr(self.fp))

    def close(self):
        """Drop the re-exported methods and close the wrapped file."""
        self.read = self.readline = self.readlines = self.fileno = None
        fp = self.fp
        if fp:
            fp.close()
        self.fp = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000787
class addclosehook(addbase):
    """Class to add a close hook to an open file.

    The hook is called with the extra constructor arguments the first
    time close() runs, after the wrapped file has been closed.
    """

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        addbase.close(self)
        hook = self.closehook
        if hook:
            hook(*self.hookargs)
            # Forget the hook so it cannot run a second time.
            self.closehook = None
            self.hookargs = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000802
class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        """Wrap fp and remember headers for later retrieval through info()."""
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        """Return the headers object passed to the constructor."""
        return self.headers
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000812
class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        """Wrap fp and remember headers and url for info() and geturl()."""
        addbase.__init__(self, fp)
        self.headers = headers
        self.url = url

    def info(self):
        """Return the headers object passed to the constructor."""
        return self.headers

    def geturl(self):
        """Return the url passed to the constructor."""
        return self.url
Guido van Rossume6ad8911996-09-10 17:02:56 +0000826
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000827
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL.

    url is returned unchanged when it already has a type (scheme), or when
    it starts with '//host' and base has no type to inherit.  Otherwise the
    type and host are taken from base, and a relative path is resolved
    against the path of base with its '#tag' and '?query' removed; leading
    '../' components of url each strip one directory from that path.
    """
    type, path = splittype(url)
    if type:
        # if url is complete (i.e., it contains a type), return it
        return url
    host, path = splithost(path)
    type, basepath = splittype(base) # inherit type from base
    if host:
        # if url contains host, just inherit type
        if type: return type + '://' + host + path
        else:
            # no type inherited, so url must have started with //
            # just return it
            return url
    host, basepath = splithost(basepath) # inherit host
    basepath, basetag = splittag(basepath) # remove extraneous cruft
    basepath, basequery = splitquery(basepath) # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to basepath
            i = len(basepath)
        else:
            # else replace last component
            i = basepath.rfind('/')
        if i < 0:
            # basepath not absolute
            if host:
                # host present, make absolute
                basepath = '/'
            else:
                # else keep non-absolute
                basepath = ''
        else:
            # remove last file component
            basepath = basepath[:i+1]
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            # Look for the slash that ends the parent directory, ignoring
            # the trailing slash of basepath itself.
            i = basepath[:-1].rfind('/')
            if i > 0:
                basepath = basepath[:i+1]
            elif i == 0:
                # Reached the root; further '../' cannot go any higher.
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    # A URL with a host always needs an absolute path.
    if host and path and path[0] != '/':
        path = '/' + path
    if type and host: return type + '://' + host + path
    elif type: return type + ':' + path
    elif host: return '//' + host + path # don't know what this means
    else: return path
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000884
885
Guido van Rossum7c395db1994-07-04 22:14:49 +0000886# Utilities to parse URLs (most of these return None for missing parts):
Sjoerd Mullendere0371b81995-11-10 10:36:07 +0000887# unwrap('<URL:type://host/path>') --> 'type://host/path'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000888# splittype('type:opaquestring') --> 'type', 'opaquestring'
889# splithost('//host[:port]/path') --> 'host[:port]', '/path'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000890# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
891# splitpasswd('user:passwd') -> 'user', 'passwd'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000892# splitport('host:port') --> 'host', 'port'
893# splitquery('/path?query') --> '/path', 'query'
894# splittag('/path#tag') --> '/path', 'tag'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000895# splitattr('/path;attr1=value1;attr2=value2;...') ->
896# '/path', ['attr1=value1', 'attr2=value2', ...]
897# splitvalue('attr=value') --> 'attr', 'value'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000898# splitgophertype('/Xselector') --> 'X', 'selector'
899# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
901
def toBytes(url):
    """toBytes(u"URL") --> 'URL'.

    Unicode URLs are encoded as ASCII; anything else is returned as is.
    Raises UnicodeError when the URL contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if type(url) is not types.UnicodeType:
        return url
    try:
        return url.encode("ASCII")
    except UnicodeError:
        message = "URL " + repr(url) + " contains non-ASCII characters"
        raise UnicodeError(message)
913
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Surrounding whitespace, an enclosing '<...>' pair and a leading
    'URL:' marker are each removed when present.
    """
    url = url.strip()
    if url.startswith('<') and url.endswith('>'):
        url = url[1:-1].strip()
    if url.startswith('URL:'):
        url = url[4:].strip()
    return url
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000921
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case; it is None when url has no type.
    """
    global _typeprog
    if _typeprog is None:
        # Compile the pattern lazily, on first use.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if not found:
        return None, url
    scheme = found.group(1)
    rest = url[len(scheme) + 1:]
    return scheme.lower(), rest
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000935
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host is None when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # Compile the pattern lazily, on first use.
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    found = _hostprog.match(url)
    if not found:
        return None, url
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000947
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    Both parts are unquoted and returned as a list when an '@' is present;
    otherwise the result is the tuple (None, host).
    """
    global _userprog
    if _userprog is None:
        # Compile the pattern lazily, on first use.
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    found = _userprog.match(host)
    if not found:
        return None, host
    return [unquote(part) for part in found.group(1, 2)]
Guido van Rossum7c395db1994-07-04 22:14:49 +0000959
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The password is None when user contains no ':'.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compile the pattern lazily, on first use.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if not found:
        return user, None
    return found.group(1, 2)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000971
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits; it is None when host does
    not end in ':' followed by digits.
    """
    global _portprog
    if _portprog is None:
        # Compile the pattern lazily, on first use.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if not found:
        return host, None
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000984
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        # Compile the pattern lazily, on first use.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if not found:
        return host, defport
    host, port = found.group(1, 2)
    nport = None
    # An empty port ("host:") counts as invalid, like a non-numeric one.
    if port:
        try:
            nport = int(port)
        except ValueError:
            nport = None
    return host, nport
Guido van Rossum53725a21996-06-13 19:12:35 +00001006
Guido van Rossum332e1441997-09-29 23:23:46 +00001007_queryprog = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001008def splitquery(url):
Guido van Rossume7b146f2000-02-04 15:28:42 +00001009 """splitquery('/path?query') --> '/path', 'query'."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001010 global _queryprog
1011 if _queryprog is None:
1012 import re
1013 _queryprog = re.compile('^(.*)\?([^?]*)$')
Guido van Rossum332e1441997-09-29 23:23:46 +00001014
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001015 match = _queryprog.match(url)
1016 if match: return match.group(1, 2)
1017 return url, None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001018
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split is made at the last '#'; the tag is None when url contains
    no '#'.
    """
    global _tagprog
    if _tagprog is None:
        # Compile the pattern lazily, on first use.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if not found:
        return url, None
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001030
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    pieces = url.split(';')
    # The first piece is the path; whatever remains are the attributes.
    path = pieces.pop(0)
    return path, pieces
Guido van Rossum7c395db1994-07-04 22:14:49 +00001036
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split is made at the first '='; the value is None when attr
    contains no '='.
    """
    global _valueprog
    if _valueprog is None:
        # Compile the pattern lazily, on first use.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if not found:
        return attr, None
    return found.group(1, 2)
Guido van Rossum7c395db1994-07-04 22:14:49 +00001048
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    The type is None when selector does not start with '/' followed by at
    least one more character.
    """
    if selector[:1] != '/' or not selector[1:2]:
        return None, selector
    gtype = selector[1]
    return gtype, selector[2:]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001054
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    A '%' that is not followed by two hexadecimal digits is left alone.
    """
    pieces = s.split('%')
    # Everything before the first '%' is copied verbatim.
    decoded = [pieces[0]]
    for piece in pieces[1:]:
        if piece[1:2]:
            try:
                decoded.append(chr(int(piece[:2], 16)) + piece[2:])
            except ValueError:
                # Not a valid escape: put the '%' back.
                decoded.append('%' + piece)
        else:
            # Fewer than two characters follow the '%'.
            decoded.append('%' + piece)
    return "".join(decoded)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001073
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'"""
    if '+' in s:
        # Plus signs stand for spaces and are translated before unquoting.
        s = s.replace('+', ' ')
    return unquote(s)
Guido van Rossum0564e121996-12-13 14:47:36 +00001080
Fredrik Lundhb49f88b2000-09-24 18:51:25 +00001081always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
Jeremy Hylton6102e292000-08-31 15:48:10 +00001082 'abcdefghijklmnopqrstuvwxyz'
Jeremy Hylton7ae51bf2000-09-14 16:59:07 +00001083 '0123456789' '_.-')
1084
1085_fast_safe_test = always_safe + '/'
1086_fast_safe = None
1087
1088def _fast_quote(s):
1089 global _fast_safe
1090 if _fast_safe is None:
1091 _fast_safe = {}
1092 for c in _fast_safe_test:
1093 _fast_safe[c] = c
1094 res = list(s)
1095 for i in range(len(res)):
1096 c = res[i]
1097 if not _fast_safe.has_key(c):
Guido van Rossume27a7b82001-01-19 03:28:15 +00001098 res[i] = '%%%02X' % ord(c)
Guido van Rossumb2493f82000-12-15 15:01:37 +00001099 return ''.join(res)
Jeremy Hylton7ae51bf2000-09-14 16:59:07 +00001100
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Different parts of a URL (the path info, the query, and so on) each
    have their own set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of them is reserved in some component of a URL, but not
    necessarily in all of them.

    With the default arguments this function is meant for quoting the
    path section of a URL, so '/' is not encoded: in a path the slashes
    already present are normally there in their reserved role.  Pass a
    different safe string to change which extra characters are kept.
    """
    allowed = always_safe + safe
    if allowed == _fast_safe_test:
        # Default safe set: use the table-driven implementation.
        return _fast_quote(s)
    pieces = []
    for ch in s:
        if ch not in allowed:
            ch = '%%%02X' % ord(ch)
        pieces.append(ch)
    return ''.join(pieces)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001131
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' not in s:
        return quote(s, safe)
    # Quote each space-separated word on its own, then join with '+'.
    words = [quote(word, safe) for word in s.split(' ')]
    return '+'.join(words)
Guido van Rossum0564e121996-12-13 14:47:36 +00001141
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001142def urlencode(query,doseq=0):
1143 """Encode a sequence of two-element tuples or dictionary into a URL query string.
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001144
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001145 If any values in the query arg are sequences and doseq is true, each
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001146 sequence element is converted to a separate parameter.
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001147
1148 If the query arg is a sequence of two-element tuples, the order of the
1149 parameters in the output will match the order of parameters in the
1150 input.
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001151 """
Tim Peters658cba62001-02-09 20:06:00 +00001152
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001153 if hasattr(query,"items"):
1154 # mapping objects
1155 query = query.items()
1156 else:
1157 # it's a bother at times that strings and string-like objects are
1158 # sequences...
1159 try:
1160 # non-sequence items should not work with len()
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001161 # non-empty strings will fail this
1162 if len(query) and type(query[0]) != types.TupleType:
1163 raise TypeError
1164 # zero-length sequences of all types will get here and succeed,
1165 # but that's a minor nit - since the original implementation
1166 # allowed empty dicts that type of behavior probably should be
1167 # preserved for consistency
1168 except TypeError:
1169 ty,va,tb = sys.exc_info()
1170 raise TypeError, "not a valid non-string sequence or mapping object", tb
1171
Guido van Rossume7b146f2000-02-04 15:28:42 +00001172 l = []
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001173 if not doseq:
1174 # preserve old behavior
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001175 for k, v in query:
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001176 k = quote_plus(str(k))
1177 v = quote_plus(str(v))
1178 l.append(k + '=' + v)
1179 else:
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001180 for k, v in query:
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001181 k = quote_plus(str(k))
1182 if type(v) == types.StringType:
1183 v = quote_plus(v)
1184 l.append(k + '=' + v)
1185 elif type(v) == types.UnicodeType:
1186 # is there a reasonable way to convert to ASCII?
1187 # encode generates a string, but "replace" or "ignore"
1188 # lose information and "strict" can raise UnicodeError
1189 v = quote_plus(v.encode("ASCII","replace"))
1190 l.append(k + '=' + v)
1191 else:
1192 try:
1193 # is this a sufficient test for sequence-ness?
1194 x = len(v)
1195 except TypeError:
1196 # not a sequence
1197 v = quote_plus(str(v))
1198 l.append(k + '=' + v)
1199 else:
1200 # loop over the sequence
1201 for elt in v:
1202 l.append(k + '=' + quote_plus(str(elt)))
Guido van Rossumb2493f82000-12-15 15:01:37 +00001203 return '&'.join(l)
Guido van Rossum810a3391998-07-22 21:33:23 +00001204
Guido van Rossum442e7201996-03-20 15:33:11 +00001205# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    The environment is scanned for variables named <scheme>_proxy (in
    any letter case), which seems to be the standard convention.
    Variables with an empty value are ignored.  If you need a different
    way, you can pass a proxies dictionary to the [Fancy]URLopener
    constructor.

    """
    suffix = '_proxy'
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if not value:
            continue
        if name.endswith(suffix):
            scheme = name[:-len(suffix)]
            proxies[scheme] = value
    return proxies
1221
Guido van Rossum4163e701998-08-06 13:39:09 +00001222if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        An empty dictionary is returned when the ic module cannot be
        imported or ic.IC() raises ic.error.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        # Only the 'UseHTTPProxy' and 'HTTPProxyHost' settings are read.
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                # Proxy enabled but no host readable: leave 'http' unset.
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies
Mark Hammond4f570b92000-07-26 07:04:38 +00001252
    def proxy_bypass(x):
        """Return 0 for every argument: no bypass check is done here."""
        return 0
1255
Mark Hammond4f570b92000-07-26 07:04:38 +00001256elif os.name == 'nt':
def getproxies_registry():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Win32 uses the registry to store proxies.  The values 'ProxyEnable'
    and 'ProxyServer' are read from the current user's
    "Internet Settings" key.

    An empty dictionary is returned when _winreg cannot be imported or
    when 'ProxyEnable' is false.  If the key cannot be read, or a value
    has an unexpected format, whatever was collected up to that point is
    returned; no exception is propagated for those cases.

    """
    proxies = {}
    try:
        import _winreg
    except ImportError:
        # Std module, so should be around - but you never know!
        return proxies
    try:
        internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
            r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
        try:
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        if not p:
                            # Tolerate empty entries such as the one left
                            # behind by a trailing ';'
                            continue
                        protocol, address = p.split('=', 1)
                        # Only prepend the scheme when the address does
                        # not already carry one; otherwise the result
                        # would look like 'http://http://host'
                        if address.find('://') < 0:
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
        finally:
            # Release the registry handle even when a query above failed
            internetSettings.Close()
    except (WindowsError, ValueError, TypeError):
        # Either registry key not found etc, or the value in an
        # unexpected format.
        # proxies already holds everything gathered so far, so there is
        # nothing more to do
        pass
    return proxies
Guido van Rossum442e7201996-03-20 15:33:11 +00001297
def getproxies():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Settings found in the environment take precedence; the registry is
    only consulted when the environment yields nothing.

    """
    from_environment = getproxies_environment()
    if from_environment:
        return from_environment
    return getproxies_registry()
Tim Peters55c12d42001-08-09 18:04:14 +00001306
def proxy_bypass(host):
    """Return 1 if host should be contacted without the proxy, else 0.

    The decision is based on the 'ProxyEnable' and 'ProxyOverride'
    values of the current user's "Internet Settings" registry key.
    'ProxyOverride' is a ';'-separated list of glob-style entries
    ('*' and '?' are wildcards); the special entry '<local>' stands for
    localhost, 127.0.0.1 and this machine's own name and address.

    host is tested both by the name given and by the address it
    resolves to.  Entries are matched case-insensitively with re.match,
    i.e. anchored at the start of the host only.

    0 is returned when the needed modules cannot be imported, when the
    registry cannot be read, when proxying is disabled or when the
    override list is empty.
    """
    try:
        import _winreg
        import re
        import socket
    except ImportError:
        # Std modules, so should be around - but you never know!
        return 0
    try:
        internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
            r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
        try:
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            # Returned as Unicode but problems if not converted to ASCII
            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
        finally:
            # Do not leak the registry handle
            internetSettings.Close()
    except (WindowsError, ValueError):
        # ValueError also covers the UnicodeError str() raises for a
        # value that cannot be converted to ASCII
        return 0
    if not proxyEnable or not proxyOverride:
        return 0
    # try to make a host list from name and IP address.
    hosts = [host]
    try:
        addr = socket.gethostbyname(host)
        if addr != host:
            hosts.append(addr)
    except socket.error:
        pass
    # make a check value list from the registry entry: replace the
    # '<local>' string by the localhost entry and the corresponding
    # canonical entry.
    patterns = []
    for entry in proxyOverride.split(';'):
        if entry == '<local>':
            patterns.append('localhost')
            patterns.append('127.0.0.1')
            try:
                localName = socket.gethostname()
                patterns.append(localName)
                patterns.append(socket.gethostbyname(localName))
            except socket.error:
                # Own name not resolvable: keep what was gathered so far
                pass
        elif entry:
            # Empty entries are dropped: as a pattern '' would match
            # every host
            patterns.append(entry)
    # now check if we match one of the registry values.
    for test in patterns:
        test = test.replace(".", r"\.")     # mask dots
        test = test.replace("*", r".*")     # change glob sequence
        test = test.replace("?", r".")      # change glob char
        for val in hosts:
            if re.match(test, val, re.I):
                return 1
    return 0
1359
Mark Hammond4f570b92000-07-26 07:04:38 +00001360else:
1361 # By default use environment variables
1362 getproxies = getproxies_environment
1363
def proxy_bypass(host):
    """Tell the caller whether the proxy should be skipped for host.

    This fallback keeps no bypass list: host is ignored and the answer
    is always 0 (never bypass).
    """
    return 0
Guido van Rossum442e7201996-03-20 15:33:11 +00001366
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001367# Test and time quote() and unquote()
def test1():
    """Round-trip every character code through quote() and unquote().

    The sample is the 256 character codes in order, repeated four times.
    Prints 'Wrong!' if the unquoted result differs from the sample, then
    the repr of the sample, of its quoted form and of the unquoted
    result, and finally the time the round trip took, in seconds.
    """
    import time
    sample = ''.join(map(chr, range(256))) * 4
    started = time.time()
    quoted = quote(sample)
    restored = unquote(quoted)
    finished = time.time()
    if restored != sample:
        print('Wrong!')
    print(repr(sample))
    print(repr(quoted))
    print(repr(restored))
    print('%s %s' % (round(finished - started, 3), 'sec'))
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001383
1384
def reporthook(blocknum, blocksize, totalsize):
    """Print one progress line during a remote transfer.

    The line shows the block number, the block size and the total size,
    each formatted as an integer.
    """
    progress = (blocknum, blocksize, totalsize)
    print("Block number: %d, Block size: %d, Total size: %d" % progress)
Guido van Rossum9ab96d41998-09-28 14:07:00 +00001389
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001390# Test program
def test(args=None):
    """Retrieve each URL in args and print what came back.

    For every URL this prints a banner, the name of the local file
    urlretrieve() returned, the headers (when there are any) and the
    file's contents with carriage returns removed.  Transfer progress
    is reported through reporthook().

    When args is empty or omitted a built-in list of sample URLs is
    used, plus an https one if URLopener has an open_https method.

    urlcleanup() is always called at the end, even when a retrieval
    fails.
    """
    if not args:
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.python.org/etc/passwd',
##          'gopher://gopher.micro.umn.edu/1/',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print('%s %s %s' % ('-'*10, url, '-'*10))
            fn, h = urlretrieve(url, None, reporthook)
            print(fn)
            if h:
                print('======')
                for k in h.keys():
                    print('%s %s' % (k + ':', h[k]))
                print('======')
            fp = open(fn, 'rb')
            try:
                data = fp.read()
            finally:
                # Close explicitly instead of relying on the object
                # being reclaimed as soon as the name goes away
                fp.close()
            if '\r' in data:
                data = data.replace('\r', '')
            print(data)
            # Drop the references to the file name and headers before
            # the next retrieval
            fn, h = None, None
        print('-'*40)
    finally:
        urlcleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001423
def main():
    """Command-line driver.

    Options, taken from sys.argv:
      -t   run test() on the remaining arguments; given more than once,
           test1() is run first
      -h   print a usage message and return

    Without -t the contents of every URL argument are printed; with no
    arguments at all only a hint about -h is printed.  An unrecognised
    option prints the getopt error plus the same hint.
    """
    import getopt, sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error:
        print(sys.exc_info()[1])
        print("Use -h for help")
        return
    selftest = 0
    for flag, _value in opts:
        if flag == '-t':
            selftest = selftest + 1
        if flag == '-h':
            print("Usage: python urllib.py [-t] [url ...]")
            # trailing comma: the print statement omits the newline, so
            # the next line continues this one
            print("-t runs self-test;"),
            print("otherwise, contents of urls are printed")
            return
    if not selftest:
        if not args:
            print("Use -h for help")
        for url in args:
            # trailing comma: no newline is added after the contents
            print(urlopen(url).read()),
        return
    if selftest > 1:
        test1()
    test(args)
Guido van Rossum23490151998-06-25 02:39:00 +00001450
# Script entry point: hand control to main() when this file is executed
# directly; nothing happens here on import.
if __name__ == '__main__':
    main()