blob: 2ba55908778a6770a926998c2b8956e0cee54c8a [file] [log] [blame]
"""Open an arbitrary URL.

See the following document for more info on URLs:
"Names and Addresses, URIs, URLs, URNs, URCs", at
http://www.w3.org/pub/WWW/Addressing/Overview.html

See also the HTTP spec (from which the error codes are derived):
"HTTP - Hypertext Transfer Protocol", at
http://www.w3.org/pub/WWW/Protocols/

Related standards and specs:
- RFC1808: the "relative URL" spec. (authoritative status)
- RFC1738 - the "URL standard". (authoritative status)
- RFC1630 - the "URI spec". (informational status)

The object returned by URLopener().open(file) will differ per
protocol.  All you know is that it has methods read(), readline(),
readlines(), fileno(), close() and info().  The read*(), fileno()
and close() methods work like those of open files.
The info() method returns a mimetools.Message object which can be
used to query various info about the object, if available.
(mimetools.Message objects are queried with the getheader() method.)
"""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000024
Guido van Rossum7c395db1994-07-04 22:14:49 +000025import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000026import socket
Jack Jansendc3e3f61995-12-15 13:22:13 +000027import os
Guido van Rossumf0713d32001-08-09 17:43:35 +000028import time
Guido van Rossum3c8484e1996-11-20 22:02:24 +000029import sys
Martin v. Löwis1d994332000-12-03 18:30:10 +000030import types
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000031
# Public names exported by "from urllib import *" (order is significant
# only in that it is preserved from earlier releases).
__all__ = [
    "urlopen",
    "URLopener",
    "FancyURLopener",
    "urlretrieve",
    "urlcleanup",
    "quote",
    "quote_plus",
    "unquote",
    "unquote_plus",
    "urlencode",
    "url2pathname",
    "pathname2url",
    "splittag",
    "localhost",
    "thishost",
    "ftperrors",
    "basejoin",
    "unwrap",
    "splittype",
    "splithost",
    "splituser",
    "splitpasswd",
    "splitport",
    "splitnport",
    "splitquery",
    "splitattr",
    "splitvalue",
    "splitgophertype",
    "getproxies",
]

# Reported in the default User-agent header (see URLopener.version).
__version__ = '1.15'    # XXX This version is not always updated :-(

# URLopener.open_ftp() starts closing cached FTP connections once the
# cache holds more than this many entries.
MAXFTPCACHE = 10
Guido van Rossum6cb15a01995-06-22 19:00:13 +000043
Jack Jansendc3e3f61995-12-15 13:22:13 +000044# Helper for non-unix systems
if os.name not in ('mac', 'nt', 'riscos'):
    # Generic platforms: a path is just the URL path with %XX escapes
    # undone (and the reverse for pathname2url).
    def url2pathname(pathname):
        """Convert the path component of a URL to a local file name."""
        return unquote(pathname)

    def pathname2url(pathname):
        """Convert a local file name to the path component of a URL."""
        return quote(pathname)
elif os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    # os.name == 'riscos'
    from rourl2path import url2pathname, pathname2url
Guido van Rossum33add0a1998-12-18 15:25:22 +000056
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000057# This really consists of two pieces:
58# (1) a class which handles opening of all sorts of URLs
59# (plus assorted utilities etc.)
60# (2) a set of functions for parsing URLs
61# XXX Should these be separated out into different modules?
62
63
# Shortcut for basic usage: one module-wide opener, created on first use
# and shared by urlopen(), urlretrieve() and urlcleanup().
_urlopener = None
def urlopen(url, data=None):
    """urlopen(url [, data]) -> open file-like object

    Opens url with the shared FancyURLopener instance, creating that
    instance the first time it is needed.  When data is given it is
    passed on to the opener's open() method as well.
    """
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    opener = _urlopener
    if data is not None:
        return opener.open(url, data)
    return opener.open(url)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve url through the shared opener's retrieve() method.

    The shared FancyURLopener is created on first use; all arguments are
    forwarded unchanged and retrieve()'s result is returned as is.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    """Run cleanup() on the shared opener, if one has been created."""
    opener = _urlopener
    if not opener:
        # Nothing was ever opened through the shortcuts.
        return
    opener.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000083
84
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000085ftpcache = {}
86class URLopener:
Guido van Rossume7b146f2000-02-04 15:28:42 +000087 """Class to open URLs.
88 This is a class rather than just a subroutine because we may need
89 more than one set of global protocol-specific options.
90 Note -- this is a base class for those who don't want the
91 automatic handling of errors type 302 (relocated) and 401
92 (authorization needed)."""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000093
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000094 __tempfiles = None
Guido van Rossum29e77811996-11-27 19:39:58 +000095
Guido van Rossumba311382000-08-24 16:18:04 +000096 version = "Python-urllib/%s" % __version__
97
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000098 # Constructor
Guido van Rossum09c8b6c1999-12-07 21:37:17 +000099 def __init__(self, proxies=None, **x509):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000100 if proxies is None:
101 proxies = getproxies()
102 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
103 self.proxies = proxies
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000104 self.key_file = x509.get('key_file')
105 self.cert_file = x509.get('cert_file')
Guido van Rossumba311382000-08-24 16:18:04 +0000106 self.addheaders = [('User-agent', self.version)]
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000107 self.__tempfiles = []
108 self.__unlink = os.unlink # See cleanup()
109 self.tempcache = None
110 # Undocumented feature: if you assign {} to tempcache,
111 # it is used to cache files retrieved with
112 # self.retrieve(). This is not enabled by default
113 # since it does not work for changing documents (and I
114 # haven't got the logic to check expiration headers
115 # yet).
116 self.ftpcache = ftpcache
117 # Undocumented feature: you can use a different
118 # ftp cache by assigning to the .ftpcache member;
119 # in case you want logically independent URL openers
120 # XXX This is not threadsafe. Bah.
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000121
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000122 def __del__(self):
123 self.close()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000124
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000125 def close(self):
126 self.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000127
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000128 def cleanup(self):
129 # This code sometimes runs when the rest of this module
130 # has already been deleted, so it can't use any globals
131 # or import anything.
132 if self.__tempfiles:
133 for file in self.__tempfiles:
134 try:
135 self.__unlink(file)
Martin v. Löwis58682b72001-08-11 15:02:57 +0000136 except OSError:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000137 pass
138 del self.__tempfiles[:]
139 if self.tempcache:
140 self.tempcache.clear()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000141
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000142 def addheader(self, *args):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000143 """Add a header to be used by the HTTP interface only
144 e.g. u.addheader('Accept', 'sound/basic')"""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000145 self.addheaders.append(args)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000146
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000147 # External interface
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000148 def open(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000149 """Use URLopener().open(file) instead of open(file, 'r')."""
Martin v. Löwis1d994332000-12-03 18:30:10 +0000150 fullurl = unwrap(toBytes(fullurl))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000151 if self.tempcache and self.tempcache.has_key(fullurl):
152 filename, headers = self.tempcache[fullurl]
153 fp = open(filename, 'rb')
154 return addinfourl(fp, headers, fullurl)
Martin v. Löwis1d994332000-12-03 18:30:10 +0000155 urltype, url = splittype(fullurl)
156 if not urltype:
157 urltype = 'file'
158 if self.proxies.has_key(urltype):
159 proxy = self.proxies[urltype]
160 urltype, proxyhost = splittype(proxy)
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000161 host, selector = splithost(proxyhost)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000162 url = (host, fullurl) # Signal special case to open_*()
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000163 else:
164 proxy = None
Martin v. Löwis1d994332000-12-03 18:30:10 +0000165 name = 'open_' + urltype
166 self.type = urltype
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000167 if '-' in name:
168 # replace - with _
Guido van Rossumb2493f82000-12-15 15:01:37 +0000169 name = '_'.join(name.split('-'))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000170 if not hasattr(self, name):
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000171 if proxy:
172 return self.open_unknown_proxy(proxy, fullurl, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000173 else:
174 return self.open_unknown(fullurl, data)
175 try:
176 if data is None:
177 return getattr(self, name)(url)
178 else:
179 return getattr(self, name)(url, data)
180 except socket.error, msg:
181 raise IOError, ('socket error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000182
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000183 def open_unknown(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000184 """Overridable interface to open unknown URL type."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000185 type, url = splittype(fullurl)
186 raise IOError, ('url error', 'unknown url type', type)
Guido van Rossumca445401995-08-29 19:19:12 +0000187
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000188 def open_unknown_proxy(self, proxy, fullurl, data=None):
189 """Overridable interface to open unknown URL type."""
190 type, url = splittype(fullurl)
191 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
192
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000193 # External interface
Sjoerd Mullenderd7b86f02000-08-25 11:23:36 +0000194 def retrieve(self, url, filename=None, reporthook=None, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000195 """retrieve(url) returns (filename, None) for a local object
196 or (tempfilename, headers) for a remote object."""
Martin v. Löwis1d994332000-12-03 18:30:10 +0000197 url = unwrap(toBytes(url))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000198 if self.tempcache and self.tempcache.has_key(url):
199 return self.tempcache[url]
200 type, url1 = splittype(url)
201 if not filename and (not type or type == 'file'):
202 try:
203 fp = self.open_local_file(url1)
204 hdrs = fp.info()
205 del fp
206 return url2pathname(splithost(url1)[1]), hdrs
207 except IOError, msg:
208 pass
Fred Drake316a7932000-08-24 01:01:26 +0000209 fp = self.open(url, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000210 headers = fp.info()
211 if not filename:
212 import tempfile
213 garbage, path = splittype(url)
214 garbage, path = splithost(path or "")
215 path, garbage = splitquery(path or "")
216 path, garbage = splitattr(path or "")
217 suffix = os.path.splitext(path)[1]
218 filename = tempfile.mktemp(suffix)
219 self.__tempfiles.append(filename)
220 result = filename, headers
221 if self.tempcache is not None:
222 self.tempcache[url] = result
223 tfp = open(filename, 'wb')
224 bs = 1024*8
225 size = -1
226 blocknum = 1
227 if reporthook:
228 if headers.has_key("content-length"):
229 size = int(headers["Content-Length"])
230 reporthook(0, bs, size)
231 block = fp.read(bs)
232 if reporthook:
233 reporthook(1, bs, size)
234 while block:
235 tfp.write(block)
236 block = fp.read(bs)
237 blocknum = blocknum + 1
238 if reporthook:
239 reporthook(blocknum, bs, size)
240 fp.close()
241 tfp.close()
242 del fp
243 del tfp
244 return result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000245
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000246 # Each method named open_<type> knows how to open that type of URL
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000247
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000248 def open_http(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000249 """Use HTTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000250 import httplib
251 user_passwd = None
Martin v. Löwis1d994332000-12-03 18:30:10 +0000252 if type(url) is types.StringType:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000253 host, selector = splithost(url)
254 if host:
255 user_passwd, host = splituser(host)
256 host = unquote(host)
257 realhost = host
258 else:
259 host, selector = url
260 urltype, rest = splittype(selector)
261 url = rest
262 user_passwd = None
Guido van Rossumb2493f82000-12-15 15:01:37 +0000263 if urltype.lower() != 'http':
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000264 realhost = None
265 else:
266 realhost, rest = splithost(rest)
267 if realhost:
268 user_passwd, realhost = splituser(realhost)
269 if user_passwd:
270 selector = "%s://%s%s" % (urltype, realhost, rest)
Tim Peters55c12d42001-08-09 18:04:14 +0000271 if proxy_bypass(realhost):
272 host = realhost
273
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000274 #print "proxy via http:", host, selector
275 if not host: raise IOError, ('http error', 'no host given')
276 if user_passwd:
277 import base64
Guido van Rossumb2493f82000-12-15 15:01:37 +0000278 auth = base64.encodestring(user_passwd).strip()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000279 else:
280 auth = None
281 h = httplib.HTTP(host)
282 if data is not None:
283 h.putrequest('POST', selector)
284 h.putheader('Content-type', 'application/x-www-form-urlencoded')
285 h.putheader('Content-length', '%d' % len(data))
286 else:
287 h.putrequest('GET', selector)
288 if auth: h.putheader('Authorization', 'Basic %s' % auth)
289 if realhost: h.putheader('Host', realhost)
290 for args in self.addheaders: apply(h.putheader, args)
291 h.endheaders()
292 if data is not None:
Fred Drakeec3dfde2001-07-04 05:18:29 +0000293 h.send(data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000294 errcode, errmsg, headers = h.getreply()
295 fp = h.getfile()
296 if errcode == 200:
297 return addinfourl(fp, headers, "http:" + url)
298 else:
299 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000300 return self.http_error(url, fp, errcode, errmsg, headers)
Guido van Rossum29aab751999-03-09 19:31:21 +0000301 else:
302 return self.http_error(url, fp, errcode, errmsg, headers, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000303
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000304 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000305 """Handle http errors.
306 Derived class can override this, or provide specific handlers
307 named http_error_DDD where DDD is the 3-digit error code."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000308 # First check if there's a specific handler for this error
309 name = 'http_error_%d' % errcode
310 if hasattr(self, name):
311 method = getattr(self, name)
312 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000313 result = method(url, fp, errcode, errmsg, headers)
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000314 else:
315 result = method(url, fp, errcode, errmsg, headers, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000316 if result: return result
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000317 return self.http_error_default(url, fp, errcode, errmsg, headers)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000318
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000319 def http_error_default(self, url, fp, errcode, errmsg, headers):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000320 """Default error handler: close the connection and raise IOError."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000321 void = fp.read()
322 fp.close()
323 raise IOError, ('http error', errcode, errmsg, headers)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000324
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000325 if hasattr(socket, "ssl"):
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000326 def open_https(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000327 """Use HTTPS protocol."""
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000328 import httplib
Fred Drake567ca8e2000-08-21 21:42:42 +0000329 user_passwd = None
Moshe Zadkab2a0a832001-01-08 07:09:25 +0000330 if type(url) is types.StringType:
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000331 host, selector = splithost(url)
Fred Drake567ca8e2000-08-21 21:42:42 +0000332 if host:
333 user_passwd, host = splituser(host)
334 host = unquote(host)
335 realhost = host
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000336 else:
337 host, selector = url
338 urltype, rest = splittype(selector)
Fred Drake567ca8e2000-08-21 21:42:42 +0000339 url = rest
340 user_passwd = None
Guido van Rossumb2493f82000-12-15 15:01:37 +0000341 if urltype.lower() != 'https':
Fred Drake567ca8e2000-08-21 21:42:42 +0000342 realhost = None
343 else:
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000344 realhost, rest = splithost(rest)
Fred Drake567ca8e2000-08-21 21:42:42 +0000345 if realhost:
346 user_passwd, realhost = splituser(realhost)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000347 if user_passwd:
348 selector = "%s://%s%s" % (urltype, realhost, rest)
Andrew M. Kuchling7ad47922000-06-10 01:41:48 +0000349 #print "proxy via https:", host, selector
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000350 if not host: raise IOError, ('https error', 'no host given')
351 if user_passwd:
352 import base64
Guido van Rossumb2493f82000-12-15 15:01:37 +0000353 auth = base64.encodestring(user_passwd).strip()
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000354 else:
355 auth = None
356 h = httplib.HTTPS(host, 0,
357 key_file=self.key_file,
358 cert_file=self.cert_file)
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000359 if data is not None:
360 h.putrequest('POST', selector)
361 h.putheader('Content-type',
362 'application/x-www-form-urlencoded')
363 h.putheader('Content-length', '%d' % len(data))
364 else:
365 h.putrequest('GET', selector)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000366 if auth: h.putheader('Authorization: Basic %s' % auth)
Fred Drake567ca8e2000-08-21 21:42:42 +0000367 if realhost: h.putheader('Host', realhost)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000368 for args in self.addheaders: apply(h.putheader, args)
369 h.endheaders()
Andrew M. Kuchling43c5af02000-04-24 14:17:06 +0000370 if data is not None:
Fred Drakeec3dfde2001-07-04 05:18:29 +0000371 h.send(data)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000372 errcode, errmsg, headers = h.getreply()
373 fp = h.getfile()
374 if errcode == 200:
Guido van Rossumb931bf32001-12-08 17:09:07 +0000375 return addinfourl(fp, headers, "https:" + url)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000376 else:
Fred Drake567ca8e2000-08-21 21:42:42 +0000377 if data is None:
378 return self.http_error(url, fp, errcode, errmsg, headers)
379 else:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000380 return self.http_error(url, fp, errcode, errmsg, headers,
381 data)
Fred Drake567ca8e2000-08-21 21:42:42 +0000382
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000383 def open_gopher(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000384 """Use Gopher protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000385 import gopherlib
386 host, selector = splithost(url)
387 if not host: raise IOError, ('gopher error', 'no host given')
388 host = unquote(host)
389 type, selector = splitgophertype(selector)
390 selector, query = splitquery(selector)
391 selector = unquote(selector)
392 if query:
393 query = unquote(query)
394 fp = gopherlib.send_query(selector, query, host)
395 else:
396 fp = gopherlib.send_selector(selector, host)
397 return addinfourl(fp, noheaders(), "gopher:" + url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000398
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000399 def open_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000400 """Use local file or FTP depending on form of URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000401 if url[:2] == '//' and url[2:3] != '/':
402 return self.open_ftp(url)
403 else:
404 return self.open_local_file(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000405
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000406 def open_local_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000407 """Use local file."""
Jeremy Hylton6d8c1aa2001-08-27 20:16:53 +0000408 import mimetypes, mimetools, rfc822, StringIO
Guido van Rossumf0713d32001-08-09 17:43:35 +0000409 host, file = splithost(url)
410 localname = url2pathname(file)
411 stats = os.stat(localname)
Walter Dörwald92b48b72002-03-22 17:30:38 +0000412 size = stats.st_size
413 modified = rfc822.formatdate(stats.st_mtime)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000414 mtype = mimetypes.guess_type(url)[0]
415 headers = mimetools.Message(StringIO.StringIO(
Guido van Rossumf0713d32001-08-09 17:43:35 +0000416 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
417 (mtype or 'text/plain', size, modified)))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000418 if not host:
Guido van Rossum336a2011999-06-24 15:27:36 +0000419 urlfile = file
420 if file[:1] == '/':
421 urlfile = 'file://' + file
Guido van Rossumf0713d32001-08-09 17:43:35 +0000422 return addinfourl(open(localname, 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000423 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000424 host, port = splitport(host)
425 if not port \
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000426 and socket.gethostbyname(host) in (localhost(), thishost()):
Guido van Rossum336a2011999-06-24 15:27:36 +0000427 urlfile = file
428 if file[:1] == '/':
429 urlfile = 'file://' + file
Guido van Rossumf0713d32001-08-09 17:43:35 +0000430 return addinfourl(open(localname, 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000431 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000432 raise IOError, ('local file error', 'not on local host')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000433
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000434 def open_ftp(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000435 """Use FTP protocol."""
Guido van Rossum88e0b5b2001-08-23 13:38:15 +0000436 import mimetypes, mimetools, StringIO
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000437 host, path = splithost(url)
438 if not host: raise IOError, ('ftp error', 'no host given')
439 host, port = splitport(host)
440 user, host = splituser(host)
441 if user: user, passwd = splitpasswd(user)
442 else: passwd = None
443 host = unquote(host)
444 user = unquote(user or '')
445 passwd = unquote(passwd or '')
446 host = socket.gethostbyname(host)
447 if not port:
448 import ftplib
449 port = ftplib.FTP_PORT
450 else:
451 port = int(port)
452 path, attrs = splitattr(path)
453 path = unquote(path)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000454 dirs = path.split('/')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000455 dirs, file = dirs[:-1], dirs[-1]
456 if dirs and not dirs[0]: dirs = dirs[1:]
Guido van Rossum5e006a31999-08-18 17:40:33 +0000457 if dirs and not dirs[0]: dirs[0] = '/'
Guido van Rossumb2493f82000-12-15 15:01:37 +0000458 key = user, host, port, '/'.join(dirs)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000459 # XXX thread unsafe!
460 if len(self.ftpcache) > MAXFTPCACHE:
461 # Prune the cache, rather arbitrarily
462 for k in self.ftpcache.keys():
463 if k != key:
464 v = self.ftpcache[k]
465 del self.ftpcache[k]
466 v.close()
467 try:
468 if not self.ftpcache.has_key(key):
469 self.ftpcache[key] = \
470 ftpwrapper(user, passwd, host, port, dirs)
471 if not file: type = 'D'
472 else: type = 'I'
473 for attr in attrs:
474 attr, value = splitvalue(attr)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000475 if attr.lower() == 'type' and \
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000476 value in ('a', 'A', 'i', 'I', 'd', 'D'):
Guido van Rossumb2493f82000-12-15 15:01:37 +0000477 type = value.upper()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000478 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
Guido van Rossum88e0b5b2001-08-23 13:38:15 +0000479 mtype = mimetypes.guess_type("ftp:" + url)[0]
480 headers = ""
481 if mtype:
482 headers += "Content-Type: %s\n" % mtype
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000483 if retrlen is not None and retrlen >= 0:
Guido van Rossum88e0b5b2001-08-23 13:38:15 +0000484 headers += "Content-Length: %d\n" % retrlen
485 headers = mimetools.Message(StringIO.StringIO(headers))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000486 return addinfourl(fp, headers, "ftp:" + url)
487 except ftperrors(), msg:
488 raise IOError, ('ftp error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000489
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000490 def open_data(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000491 """Use "data" URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000492 # ignore POSTed data
493 #
494 # syntax of data URLs:
495 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
496 # mediatype := [ type "/" subtype ] *( ";" parameter )
497 # data := *urlchar
498 # parameter := attribute "=" value
Neal Norwitzaad18492002-03-26 16:25:01 +0000499 import StringIO, mimetools
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000500 try:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000501 [type, data] = url.split(',', 1)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000502 except ValueError:
503 raise IOError, ('data error', 'bad data URL')
504 if not type:
505 type = 'text/plain;charset=US-ASCII'
Guido van Rossumb2493f82000-12-15 15:01:37 +0000506 semi = type.rfind(';')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000507 if semi >= 0 and '=' not in type[semi:]:
508 encoding = type[semi+1:]
509 type = type[:semi]
510 else:
511 encoding = ''
512 msg = []
513 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
514 time.gmtime(time.time())))
515 msg.append('Content-type: %s' % type)
516 if encoding == 'base64':
517 import base64
518 data = base64.decodestring(data)
519 else:
520 data = unquote(data)
521 msg.append('Content-length: %d' % len(data))
522 msg.append('')
523 msg.append(data)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000524 msg = '\n'.join(msg)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000525 f = StringIO.StringIO(msg)
526 headers = mimetools.Message(f, 0)
527 f.fileno = None # needed for addinfourl
528 return addinfourl(f, headers, url)
Guido van Rossum6d4d1c21998-03-12 14:32:55 +0000529
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000530
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000531class FancyURLopener(URLopener):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000532 """Derived class with handlers for errors we can handle (perhaps)."""
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000533
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000534 def __init__(self, *args):
535 apply(URLopener.__init__, (self,) + args)
536 self.auth_cache = {}
Skip Montanaroc3e11d62001-02-15 16:56:36 +0000537 self.tries = 0
538 self.maxtries = 10
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000539
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000540 def http_error_default(self, url, fp, errcode, errmsg, headers):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000541 """Default error handling -- don't raise an exception."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000542 return addinfourl(fp, headers, "http:" + url)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000543
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000544 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000545 """Error 302 -- relocated (temporarily)."""
Skip Montanaroc3e11d62001-02-15 16:56:36 +0000546 self.tries += 1
547 if self.maxtries and self.tries >= self.maxtries:
548 if hasattr(self, "http_error_500"):
549 meth = self.http_error_500
550 else:
551 meth = self.http_error_default
552 self.tries = 0
553 return meth(url, fp, 500,
554 "Internal Server Error: Redirect Recursion", headers)
555 result = self.redirect_internal(url, fp, errcode, errmsg, headers,
556 data)
557 self.tries = 0
558 return result
559
560 def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000561 if headers.has_key('location'):
562 newurl = headers['location']
563 elif headers.has_key('uri'):
564 newurl = headers['uri']
565 else:
566 return
567 void = fp.read()
568 fp.close()
Guido van Rossum3527f591999-03-29 20:23:41 +0000569 # In case the server sent a relative URL, join with original:
Moshe Zadka5d87d472001-04-09 14:54:21 +0000570 newurl = basejoin(self.type + ":" + url, newurl)
Guido van Rossum3c8baed2000-02-01 23:36:55 +0000571 if data is None:
572 return self.open(newurl)
573 else:
574 return self.open(newurl, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000575
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000576 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000577 """Error 301 -- also relocated (permanently)."""
578 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
Guido van Rossume6ad8911996-09-10 17:02:56 +0000579
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000580 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000581 """Error 401 -- authentication required.
582 See this URL for a description of the basic authentication scheme:
583 http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
Moshe Zadkae99bd172001-02-27 06:27:04 +0000584 if not headers.has_key('www-authenticate'):
Tim Peters85ba6732001-02-28 08:26:44 +0000585 URLopener.http_error_default(self, url, fp,
Fred Drakec680ae82001-10-13 18:37:07 +0000586 errcode, errmsg, headers)
Moshe Zadkae99bd172001-02-27 06:27:04 +0000587 stuff = headers['www-authenticate']
588 import re
589 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
590 if not match:
Tim Peters85ba6732001-02-28 08:26:44 +0000591 URLopener.http_error_default(self, url, fp,
Moshe Zadkae99bd172001-02-27 06:27:04 +0000592 errcode, errmsg, headers)
593 scheme, realm = match.groups()
594 if scheme.lower() != 'basic':
Tim Peters85ba6732001-02-28 08:26:44 +0000595 URLopener.http_error_default(self, url, fp,
Moshe Zadkae99bd172001-02-27 06:27:04 +0000596 errcode, errmsg, headers)
597 name = 'retry_' + self.type + '_basic_auth'
598 if data is None:
599 return getattr(self,name)(url, realm)
600 else:
601 return getattr(self,name)(url, realm, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000602
Guido van Rossum3c8baed2000-02-01 23:36:55 +0000603 def retry_http_basic_auth(self, url, realm, data=None):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000604 host, selector = splithost(url)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000605 i = host.find('@') + 1
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000606 host = host[i:]
607 user, passwd = self.get_user_passwd(host, realm, i)
608 if not (user or passwd): return None
Guido van Rossumafc4f042001-01-15 18:31:13 +0000609 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000610 newurl = 'http://' + host + selector
Guido van Rossum3c8baed2000-02-01 23:36:55 +0000611 if data is None:
612 return self.open(newurl)
613 else:
614 return self.open(newurl, data)
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000615
Guido van Rossum3c8baed2000-02-01 23:36:55 +0000616 def retry_https_basic_auth(self, url, realm, data=None):
Tim Peterse1190062001-01-15 03:34:38 +0000617 host, selector = splithost(url)
618 i = host.find('@') + 1
619 host = host[i:]
620 user, passwd = self.get_user_passwd(host, realm, i)
621 if not (user or passwd): return None
Guido van Rossumafc4f042001-01-15 18:31:13 +0000622 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
Tim Peterse1190062001-01-15 03:34:38 +0000623 newurl = '//' + host + selector
Guido van Rossumafc4f042001-01-15 18:31:13 +0000624 return self.open_https(newurl, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000625
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000626 def get_user_passwd(self, host, realm, clear_cache = 0):
Guido van Rossumb2493f82000-12-15 15:01:37 +0000627 key = realm + '@' + host.lower()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000628 if self.auth_cache.has_key(key):
629 if clear_cache:
630 del self.auth_cache[key]
631 else:
632 return self.auth_cache[key]
633 user, passwd = self.prompt_user_passwd(host, realm)
634 if user or passwd: self.auth_cache[key] = (user, passwd)
635 return user, passwd
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000636
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000637 def prompt_user_passwd(self, host, realm):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000638 """Override this in a GUI environment!"""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000639 import getpass
640 try:
641 user = raw_input("Enter username for %s at %s: " % (realm,
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000642 host))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000643 passwd = getpass.getpass("Enter password for %s in %s at %s: " %
644 (user, realm, host))
645 return user, passwd
646 except KeyboardInterrupt:
647 print
648 return None, None
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000649
650
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000651# Utility functions
652
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The address is looked up with socket.gethostbyname() and kept in
    the module global _localhost, so later calls reuse it.
    """
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000660
_thishost = None
def thishost():
    """Return the IP address of the current host.

    The name reported by socket.gethostname() is resolved with
    socket.gethostbyname(); the answer is kept in the module global
    _thishost, so later calls reuse it.
    """
    global _thishost
    if _thishost:
        return _thishost
    hostname = socket.gethostname()
    _thishost = socket.gethostbyname(hostname)
    return _thishost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000668
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000669_ftperrors = None
670def ftperrors():
Guido van Rossume7b146f2000-02-04 15:28:42 +0000671 """Return the set of errors raised by the FTP class."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000672 global _ftperrors
673 if not _ftperrors:
674 import ftplib
675 _ftperrors = ftplib.all_errors
676 return _ftperrors
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000677
_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object.

    The message is built from an empty StringIO the first time the
    function is called and kept in the module global _noheaders, so
    every caller shares the same object.
    """
    global _noheaders
    # Test against None explicitly rather than by truth value: a Message
    # that holds no headers has length zero and therefore tests false,
    # so "if not _noheaders" rebuilt the object on every single call and
    # the cache never took effect.
    if _noheaders is None:
        import mimetools
        import StringIO
        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
        _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000688
689
690# Utility classes
691
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections.

    An instance remembers the login details and directory path it was
    created with, so that init() can re-establish the connection later.
    """

    def __init__(self, user, passwd, host, port, dirs):
        """Store the connection parameters and connect right away.

        dirs is the sequence of directory names that init() changes
        into, one cwd() call each, after logging in.
        """
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.init()

    def init(self):
        """(Re)connect: open the connection, log in and cwd into self.dirs."""
        import ftplib
        # busy is set while a transfer handed out by retrfile() is pending
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        """Start retrieving file and return (fileobject, length).

        type is appended to the TYPE command; 'd' or 'D' asks for a
        directory listing instead of a file.  The returned file object
        calls endtransfer() when it is closed; the second item is
        whatever ntransfercmd() reported as its second result.

        Raises IOError('ftp error', reason) when the server refuses the
        NLST probe, or refuses RETR with a reply other than 550.
        """
        import ftplib
        # Finish off any transfer that is still pending on this connection
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The command failed; reconnect from scratch and try once more
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Use nlst to see if the file exists at all
            try:
                self.ftp.nlst(file)
            except ftplib.error_perm, reason:
                raise IOError, ('ftp error', reason), sys.exc_info()[2]
            # Restore the transfer mode!
            self.ftp.voidcmd(cmd)
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm, reason:
                # A 550 reply is swallowed: conn stays None and the
                # directory listing below is attempted instead.
                if str(reason)[:3] != '550':
                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing
            if file: cmd = 'LIST ' + file
            else: cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])

    def endtransfer(self):
        """Collect the server's response to a pending transfer, if any.

        Does nothing unless busy is set.  Errors listed by ftperrors()
        are ignored.
        """
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        """End any pending transfer and close the FTP connection.

        Errors listed by ftperrors() while closing are ignored.
        """
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000764
class addbase:
    """Base class for addinfo and addclosehook.

    Wraps a file-like object fp and re-exports its reading methods as
    attributes of the wrapper.
    """

    def __init__(self, fp):
        self.fp = fp
        # read() and readline() are required; readlines() and fileno()
        # are only exported when the wrapped object provides them.
        self.read = fp.read
        self.readline = fp.readline
        for optional in ("readlines", "fileno"):
            if hasattr(fp, optional):
                setattr(self, optional, getattr(fp, optional))

    def __repr__(self):
        details = (self.__class__.__name__, repr(id(self)), repr(self.fp))
        return '<%s at %s whose fp = %s>' % details

    def close(self):
        """Drop the exported methods and close the wrapped file, if any."""
        self.read = self.readline = self.readlines = self.fileno = None
        if self.fp:
            self.fp.close()
        self.fp = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000786
class addclosehook(addbase):
    """Class to add a close hook to an open file.

    closehook is called with hookargs as its positional arguments when
    the wrapper is closed, after the wrapped file has been closed.
    """

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        addbase.close(self)
        hook = self.closehook
        if hook:
            hook(*self.hookargs)
            # Forget the hook so that a second close() does not run it again.
            self.closehook = None
            self.hookargs = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000801
class addinfo(addbase):
    """Class to add an info() method to an open file.

    headers is stored as given and handed back unchanged by info().
    """

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        """Return the headers object passed to the constructor."""
        return self.headers
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000811
class addinfourl(addbase):
    """Class to add info() and geturl() methods to an open file.

    headers and url are stored as given and handed back unchanged by
    info() and geturl() respectively.
    """

    def __init__(self, fp, headers, url):
        addbase.__init__(self, fp)
        self.url = url
        self.headers = headers

    def info(self):
        """Return the headers object passed to the constructor."""
        return self.headers

    def geturl(self):
        """Return the URL passed to the constructor."""
        return self.url
Guido van Rossume6ad8911996-09-10 17:02:56 +0000825
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000826
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL.

    url is returned unchanged when it already has a type (scheme), or
    when it names a host and base has no type to inherit.  Otherwise
    the missing type, host and leading path are taken from base; the
    tag and query of base are discarded.  Leading '../' components of
    url each strip one directory level from the base path.
    """
    type, path = splittype(url)
    if type:
        # if url is complete (i.e., it contains a type), return it
        return url
    host, path = splithost(path)
    type, basepath = splittype(base) # inherit type from base
    if host:
        # if url contains host, just inherit type
        if type: return type + '://' + host + path
        else:
            # no type inherited, so url must have started with //
            # just return it
            return url
    host, basepath = splithost(basepath) # inherit host
    basepath, basetag = splittag(basepath) # remove extraneous cruft
    basepath, basequery = splitquery(basepath) # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to basepath
            i = len(basepath)
        else:
            # else replace last component
            i = basepath.rfind('/')
        if i < 0:
            # basepath not absolute
            if host:
                # host present, make absolute
                basepath = '/'
            else:
                # else keep non-absolute
                basepath = ''
        else:
            # remove last file component
            basepath = basepath[:i+1]
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            # basepath ends in '/', so look for the slash before that one
            i = basepath[:-1].rfind('/')
            if i > 0:
                basepath = basepath[:i+1]
            elif i == 0:
                # reached the root; further '../' components stay in path
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    # a path that follows a host must start with a slash
    if host and path and path[0] != '/':
        path = '/' + path
    if type and host: return type + '://' + host + path
    elif type: return type + ':' + path
    elif host: return '//' + host + path # don't know what this means
    else: return path
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000883
884
Guido van Rossum7c395db1994-07-04 22:14:49 +0000885# Utilities to parse URLs (most of these return None for missing parts):
Sjoerd Mullendere0371b81995-11-10 10:36:07 +0000886# unwrap('<URL:type://host/path>') --> 'type://host/path'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000887# splittype('type:opaquestring') --> 'type', 'opaquestring'
888# splithost('//host[:port]/path') --> 'host[:port]', '/path'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000889# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
890# splitpasswd('user:passwd') -> 'user', 'passwd'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000891# splitport('host:port') --> 'host', 'port'
892# splitquery('/path?query') --> '/path', 'query'
893# splittag('/path#tag') --> '/path', 'tag'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000894# splitattr('/path;attr1=value1;attr2=value2;...') ->
895# '/path', ['attr1=value1', 'attr2=value2', ...]
896# splitvalue('attr=value') --> 'attr', 'value'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000897# splitgophertype('/Xselector') --> 'X', 'selector'
898# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
900
def toBytes(url):
    """toBytes(u"URL") --> 'URL'.

    A unicode URL is encoded to an ASCII string; any other object is
    returned unchanged.  Raises UnicodeError, naming the offending URL,
    when the URL contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if type(url) is not types.UnicodeType:
        return url
    try:
        return url.encode("ASCII")
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
912
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Surrounding whitespace, an enclosing '<...>' pair and a leading
    'URL:' marker are each removed when present.
    """
    url = url.strip()
    if url.startswith('<') and url.endswith('>'):
        url = url[1:-1].strip()
    if url.startswith('URL:'):
        url = url[4:].strip()
    return url
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000920
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  When url does not start with
    'type:', the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # compiled on first use and cached in the module global
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if not found:
        return None, url
    scheme = found.group(1)
    # the match covers the scheme plus the colon that follows it
    return scheme.lower(), url[found.end():]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000934
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    When url does not start with '//', the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        # compiled on first use and cached in the module global
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1), found.group(2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000946
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split is made at the first '@' and both parts are passed
    through unquote(); in that case the two parts come back as a list.
    Without an '@' the result is the tuple (None, host).
    """
    global _userprog
    if _userprog is None:
        # compiled on first use and cached in the module global
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    found = _userprog.match(host)
    if found is None:
        return None, host
    return [unquote(part) for part in found.group(1, 2)]
Guido van Rossum7c395db1994-07-04 22:14:49 +0000958
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split is made at the first ':'.  Without a ':' the result is
    (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        # compiled on first use and cached in the module global
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.groups()
Guido van Rossum7c395db1994-07-04 22:14:49 +0000970
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string and must consist of digits only;
    otherwise the result is (host, None).
    """
    global _portprog
    if _portprog is None:
        # compiled on first use and cached in the module global
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1), found.group(2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000983
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        # compiled on first use and cached in the module global
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port = found.groups()
    # An empty port and a port int() cannot parse both give None.
    nport = None
    if port:
        try:
            nport = int(port)
        except ValueError:
            pass
    return host, nport
Guido van Rossum53725a21996-06-13 19:12:35 +00001005
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split is made at the last '?'.  Without a '?' the result is
    (url, None).
    """
    global _queryprog
    if _queryprog is None:
        # compiled on first use and cached in the module global
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    found = _queryprog.match(url)
    if found is None:
        return url, None
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001017
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split is made at the last '#'.  Without a '#' the result is
    (url, None).
    """
    global _tagprog
    if _tagprog is None:
        # compiled on first use and cached in the module global
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1), found.group(2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001029
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    pieces = url.split(';')
    path = pieces[0]
    attrs = pieces[1:]
    return path, attrs
Guido van Rossum7c395db1994-07-04 22:14:49 +00001035
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split is made at the first '='.  Without an '=' the result is
    (attr, None).
    """
    global _valueprog
    if _valueprog is None:
        # compiled on first use and cached in the module global
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.groups()
Guido van Rossum7c395db1994-07-04 22:14:49 +00001047
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    When selector does not start with '/' followed by at least one
    more character, the result is (None, selector).
    """
    if len(selector) < 2 or selector[0] != '/':
        return None, selector
    gtype = selector[1]
    return gtype, selector[2:]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001053
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Every '%' that is followed by exactly two hexadecimal digits is
    replaced by the character with that code.  Any other '%' (at the
    end of the string, or followed by something that is not a pair of
    hex digits) is left in place unchanged.
    """
    hexdigits = '0123456789abcdefABCDEF'
    pieces = s.split('%')
    res = [pieces[0]]
    for item in pieces[1:]:
        code = item[:2]
        decoded = None
        # Check the digits explicitly: int() on its own also accepts a
        # sign or surrounding whitespace, so '%+1' or '% 1' would
        # otherwise be decoded as if they were valid escapes.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            try:
                decoded = chr(int(code, 16)) + item[2:]
            except ValueError:
                # keep the escape verbatim if the conversion fails
                decoded = None
        if decoded is None:
            decoded = '%' + item
        res.append(decoded)
    return "".join(res)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001072
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'

    Like unquote(), but every '+' is turned into a space first.
    """
    # replace '+' with ' '
    s = s.replace('+', ' ')
    return unquote(s)
Guido van Rossum0564e121996-12-13 14:47:36 +00001079
Fredrik Lundhb49f88b2000-09-24 18:51:25 +00001080always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
Jeremy Hylton6102e292000-08-31 15:48:10 +00001081 'abcdefghijklmnopqrstuvwxyz'
Jeremy Hylton7ae51bf2000-09-14 16:59:07 +00001082 '0123456789' '_.-')
1083
1084_fast_safe_test = always_safe + '/'
1085_fast_safe = None
1086
1087def _fast_quote(s):
1088 global _fast_safe
1089 if _fast_safe is None:
1090 _fast_safe = {}
1091 for c in _fast_safe_test:
1092 _fast_safe[c] = c
1093 res = list(s)
1094 for i in range(len(res)):
1095 c = res[i]
1096 if not _fast_safe.has_key(c):
Guido van Rossume27a7b82001-01-19 03:28:15 +00001097 res[i] = '%%%02X' % ord(c)
Guido van Rossumb2493f82000-12-15 15:01:37 +00001098 return ''.join(res)
Jeremy Hylton7ae51bf2000-09-14 16:59:07 +00001099
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    Characters in safe, together with letters, digits and '_.-', are
    left alone; every other character becomes '%XX' with two
    upper-case hex digits.
    """
    safe = always_safe + safe
    if safe == _fast_safe_test:
        # the default safe set has a table-driven implementation
        return _fast_quote(s)
    res = []
    for c in s:
        if c in safe:
            res.append(c)
        else:
            res.append('%%%02X' % ord(c))
    return ''.join(res)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001130
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'

    Everything between the spaces is quoted with quote(), using the
    given safe characters.
    """
    if ' ' not in s:
        return quote(s, safe)
    words = [quote(word, safe) for word in s.split(' ')]
    return '+'.join(words)
Guido van Rossum0564e121996-12-13 14:47:36 +00001140
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001141def urlencode(query,doseq=0):
1142 """Encode a sequence of two-element tuples or dictionary into a URL query string.
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001143
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001144 If any values in the query arg are sequences and doseq is true, each
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001145 sequence element is converted to a separate parameter.
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001146
1147 If the query arg is a sequence of two-element tuples, the order of the
1148 parameters in the output will match the order of parameters in the
1149 input.
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001150 """
Tim Peters658cba62001-02-09 20:06:00 +00001151
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001152 if hasattr(query,"items"):
1153 # mapping objects
1154 query = query.items()
1155 else:
1156 # it's a bother at times that strings and string-like objects are
1157 # sequences...
1158 try:
1159 # non-sequence items should not work with len()
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001160 # non-empty strings will fail this
1161 if len(query) and type(query[0]) != types.TupleType:
1162 raise TypeError
1163 # zero-length sequences of all types will get here and succeed,
1164 # but that's a minor nit - since the original implementation
1165 # allowed empty dicts that type of behavior probably should be
1166 # preserved for consistency
1167 except TypeError:
1168 ty,va,tb = sys.exc_info()
1169 raise TypeError, "not a valid non-string sequence or mapping object", tb
1170
Guido van Rossume7b146f2000-02-04 15:28:42 +00001171 l = []
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001172 if not doseq:
1173 # preserve old behavior
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001174 for k, v in query:
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001175 k = quote_plus(str(k))
1176 v = quote_plus(str(v))
1177 l.append(k + '=' + v)
1178 else:
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001179 for k, v in query:
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001180 k = quote_plus(str(k))
1181 if type(v) == types.StringType:
1182 v = quote_plus(v)
1183 l.append(k + '=' + v)
1184 elif type(v) == types.UnicodeType:
1185 # is there a reasonable way to convert to ASCII?
1186 # encode generates a string, but "replace" or "ignore"
1187 # lose information and "strict" can raise UnicodeError
1188 v = quote_plus(v.encode("ASCII","replace"))
1189 l.append(k + '=' + v)
1190 else:
1191 try:
1192 # is this a sufficient test for sequence-ness?
1193 x = len(v)
1194 except TypeError:
1195 # not a sequence
1196 v = quote_plus(str(v))
1197 l.append(k + '=' + v)
1198 else:
1199 # loop over the sequence
1200 for elt in v:
1201 l.append(k + '=' + quote_plus(str(elt)))
Guido van Rossumb2493f82000-12-15 15:01:37 +00001202 return '&'.join(l)
Guido van Rossum810a3391998-07-22 21:33:23 +00001203
Guido van Rossum442e7201996-03-20 15:33:11 +00001204# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    Variable names are compared in lower case and variables with an
    empty value are skipped.

    """
    suffix = '_proxy'
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if name.endswith(suffix) and value:
            scheme = name[:-len(suffix)]
            proxies[scheme] = value
    return proxies
1220
Guido van Rossum4163e701998-08-06 13:39:09 +00001221if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        An empty dictionary is returned when the ic module cannot be
        imported or ic.IC() raises ic.error.  Only an 'http' entry is
        ever filled in.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                # proxy use is switched on but no host could be read
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies
Mark Hammond4f570b92000-07-26 07:04:38 +00001251
    def proxy_bypass(x):
        """Return 0 for every argument: no bypass list is consulted here."""
        return 0
1254
Mark Hammond4f570b92000-07-26 07:04:38 +00001255elif os.name == 'nt':
def getproxies_registry():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Win32 uses the registry to store proxies.  The ProxyEnable and
    ProxyServer values are read from the current user's
    "Internet Settings" key.  ProxyServer is either a ';'-separated list
    of "protocol=address" entries or a single address; a single address
    is used for 'http' and, unless it already starts with 'http:', for
    'ftp' as well.

    An empty dictionary is returned when _winreg cannot be imported,
    when ProxyEnable is false, or when the key or its values cannot be
    read.  A malformed ProxyServer value stops the parsing at the
    offending entry; whatever was collected before it is returned.

    """
    proxies = {}
    try:
        import _winreg
    except ImportError:
        # Std module, so should be around - but you never know!
        return proxies
    try:
        internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
            r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
        # Inner try/finally: the key handle must be released even when
        # one of the queries or the parsing below raises.
        try:
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # An address that already carries its own
                        # "scheme://" prefix is used as is instead of
                        # getting a second prefix stacked in front.
                        if address.find('://') < 0:
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
        finally:
            internetSettings.Close()
    except (WindowsError, ValueError, TypeError):
        # Either registry key not found etc, or the value in an
        # unexpected format.
        # proxies already holds everything that could be parsed, so
        # there is nothing more to do.
        pass
    return proxies
Guido van Rossum442e7201996-03-20 15:33:11 +00001296
def getproxies():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Settings found in the environment take precedence; the registry is
    consulted only when the environment yields an empty mapping.

    """
    from_environment = getproxies_environment()
    if from_environment:
        return from_environment
    return getproxies_registry()
Tim Peters55c12d42001-08-09 18:04:14 +00001305
def proxy_bypass(host):
    """Return 1 if *host* should be contacted directly, else 0.

    The decision is based on the ProxyEnable and ProxyOverride values in
    the current user's "Internet Settings" registry key.  ProxyOverride
    is a ';'-separated list of glob patterns ('*' matches any run of
    characters, '?' a single character); the special entry '<local>'
    stands for 'localhost', '127.0.0.1' and this machine's own name and
    address.  Both *host* and the address it resolves to are tried
    against every pattern, ignoring case.  Patterns are anchored at the
    start of the name only.

    0 is returned when the required modules are missing, the registry
    cannot be read, proxying is disabled, the override list is empty,
    or no pattern matches.
    """
    try:
        import _winreg
        import re
    except ImportError:
        # Std modules, so should be around - but you never know!
        return 0
    try:
        internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
            r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
        # Inner try/finally so the key handle is always released.
        try:
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            # Returned as Unicode but problems if not converted to ASCII
            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
        finally:
            internetSettings.Close()
    except (WindowsError, ValueError, TypeError):
        # Key or value missing, or a value that cannot be converted:
        # treat it the same way getproxies_registry() does.
        return 0
    if not proxyEnable or not proxyOverride:
        return 0
    # try to make a host list from name and IP address.
    candidates = [host]
    try:
        addr = socket.gethostbyname(host)
        # Only add the address when it differs from what we were given
        # (i.e. the caller did not pass an IP address already).
        if addr != host:
            candidates.append(addr)
    except socket.error:
        pass
    # make a check value list from the registry entry: replace the
    # '<local>' string by the localhost entry and the corresponding
    # canonical entry.
    patterns = []
    for entry in proxyOverride.split(';'):
        if entry == '<local>':
            patterns.append('localhost')
            patterns.append('127.0.0.1')
            try:
                localName = socket.gethostname()
                patterns.append(localName)
                patterns.append(socket.gethostbyname(localName))
            except socket.error:
                # Our own name does not resolve; the fixed entries
                # above still apply.
                pass
        elif entry:
            # Empty entries (stray ';') are dropped: as a regular
            # expression an empty pattern would match every host.
            patterns.append(entry)
    # now check if we match one of the registry values.
    for pattern in patterns:
        pattern = pattern.replace(".", r"\.")     # mask dots
        pattern = pattern.replace("*", r".*")     # change glob sequence
        pattern = pattern.replace("?", r".")      # change glob char
        for candidate in candidates:
            if re.match(pattern, candidate, re.I):
                return 1
    return 0
1357
Mark Hammond4f570b92000-07-26 07:04:38 +00001358else:
1359 # By default use environment variables
1360 getproxies = getproxies_environment
1361
def proxy_bypass(host):
    """Tell whether the proxy should be skipped for *host*.

    This fallback variant has no bypass list to consult: the argument
    is ignored and the answer is always 0 (never bypass).
    """
    return 0
Guido van Rossum442e7201996-03-20 15:33:11 +00001364
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001365# Test and time quote() and unquote()
1366def test1():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001367 s = ''
1368 for i in range(256): s = s + chr(i)
1369 s = s*4
1370 t0 = time.time()
1371 qs = quote(s)
1372 uqs = unquote(qs)
1373 t1 = time.time()
1374 if uqs != s:
1375 print 'Wrong!'
1376 print `s`
1377 print `qs`
1378 print `uqs`
1379 print round(t1 - t0, 3), 'sec'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001380
1381
Guido van Rossum9ab96d41998-09-28 14:07:00 +00001382def reporthook(blocknum, blocksize, totalsize):
1383 # Report during remote transfers
Guido van Rossumb2493f82000-12-15 15:01:37 +00001384 print "Block number: %d, Block size: %d, Total size: %d" % (
1385 blocknum, blocksize, totalsize)
Guido van Rossum9ab96d41998-09-28 14:07:00 +00001386
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001387# Test program
Guido van Rossum23490151998-06-25 02:39:00 +00001388def test(args=[]):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001389 if not args:
1390 args = [
1391 '/etc/passwd',
1392 'file:/etc/passwd',
1393 'file://localhost/etc/passwd',
Andrew M. Kuchling56a42352002-03-18 22:18:46 +00001394 'ftp://ftp.python.org/pub/python/README',
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001395## 'gopher://gopher.micro.umn.edu/1/',
1396 'http://www.python.org/index.html',
1397 ]
Guido van Rossum09c8b6c1999-12-07 21:37:17 +00001398 if hasattr(URLopener, "open_https"):
1399 args.append('https://synergy.as.cmu.edu/~geek/')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001400 try:
1401 for url in args:
1402 print '-'*10, url, '-'*10
1403 fn, h = urlretrieve(url, None, reporthook)
Guido van Rossumb2493f82000-12-15 15:01:37 +00001404 print fn
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001405 if h:
1406 print '======'
1407 for k in h.keys(): print k + ':', h[k]
1408 print '======'
1409 fp = open(fn, 'rb')
1410 data = fp.read()
1411 del fp
1412 if '\r' in data:
1413 table = string.maketrans("", "")
Guido van Rossumb2493f82000-12-15 15:01:37 +00001414 data = data.translate(table, "\r")
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001415 print data
1416 fn, h = None, None
1417 print '-'*40
1418 finally:
1419 urlcleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001420
Guido van Rossum23490151998-06-25 02:39:00 +00001421def main():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001422 import getopt, sys
1423 try:
1424 opts, args = getopt.getopt(sys.argv[1:], "th")
1425 except getopt.error, msg:
1426 print msg
1427 print "Use -h for help"
1428 return
1429 t = 0
1430 for o, a in opts:
1431 if o == '-t':
1432 t = t + 1
1433 if o == '-h':
1434 print "Usage: python urllib.py [-t] [url ...]"
1435 print "-t runs self-test;",
1436 print "otherwise, contents of urls are printed"
1437 return
1438 if t:
1439 if t > 1:
1440 test1()
1441 test(args)
1442 else:
1443 if not args:
1444 print "Use -h for help"
1445 for url in args:
1446 print urlopen(url).read(),
Guido van Rossum23490151998-06-25 02:39:00 +00001447
# Script entry point: hand control to main() when this file is executed
# directly rather than imported.
if __name__ == '__main__':
    main()