blob: 53005c8d8d5a17704987a71e2b6cedab6a3209b3 [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""Open an arbitrary URL.
2
3See the following document for more info on URLs:
4"Names and Addresses, URIs, URLs, URNs, URCs", at
5http://www.w3.org/pub/WWW/Addressing/Overview.html
6
7See also the HTTP spec (from which the error codes are derived):
8"HTTP - Hypertext Transfer Protocol", at
9http://www.w3.org/pub/WWW/Protocols/
10
11Related standards and specs:
12- RFC1808: the "relative URL" spec. (authoritative status)
13- RFC1738 - the "URL standard". (authoritative status)
14- RFC1630 - the "URI spec". (informational status)
15
16The object returned by URLopener().open(file) will differ per
17protocol. All you know is that it has methods read(), readline(),
18readlines(), fileno(), close() and info(). The read*(), fileno()
Fredrik Lundhb49f88b2000-09-24 18:51:25 +000019and close() methods work like those of open files.
Guido van Rossume7b146f2000-02-04 15:28:42 +000020The info() method returns a mimetools.Message object which can be
21used to query various info about the object, if available.
22(mimetools.Message objects are queried with the getheader() method.)
23"""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000024
Guido van Rossum7c395db1994-07-04 22:14:49 +000025import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000026import socket
Jack Jansendc3e3f61995-12-15 13:22:13 +000027import os
Guido van Rossum3c8484e1996-11-20 22:02:24 +000028import sys
Martin v. Löwis1d994332000-12-03 18:30:10 +000029import types
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000030
Skip Montanaro40fc1602001-03-01 04:27:19 +000031__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
32 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
Skip Montanaro44d5e0c2001-03-13 19:47:16 +000033 "urlencode", "url2pathname", "pathname2url", "splittag",
34 "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
35 "splittype", "splithost", "splituser", "splitpasswd", "splitport",
36 "splitnport", "splitquery", "splitattr", "splitvalue",
37 "splitgophertype", "getproxies"]
Skip Montanaro40fc1602001-03-01 04:27:19 +000038
Guido van Rossumb2493f82000-12-15 15:01:37 +000039__version__ = '1.15' # XXX This version is not always updated :-(
Guido van Rossumf668d171997-06-06 21:11:11 +000040
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000041MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
Guido van Rossum6cb15a01995-06-22 19:00:13 +000042
# Select the url2pathname()/pathname2url() pair for this platform.
# Non-unix systems supply their own converter modules; everything
# else falls back to plain quote()/unquote().
if os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """Return the local pathname for a URL path, via unquote()."""
        return unquote(pathname)

    def pathname2url(pathname):
        """Return the URL path for a local pathname, via quote()."""
        return quote(pathname)
Guido van Rossum33add0a1998-12-18 15:25:22 +000055
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000056# This really consists of two pieces:
57# (1) a class which handles opening of all sorts of URLs
58# (plus assorted utilities etc.)
59# (2) a set of functions for parsing URLs
60# XXX Should these be separated out into different modules?
61
62
# Shortcut for basic usage: one module-wide opener, created lazily
_urlopener = None
def urlopen(url, data=None):
    """urlopen(url [, data]) -> open file-like object

    Opens url with the shared module-level FancyURLopener, creating
    it on first use.  data is only passed on when it is not None.
    """
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    opener = _urlopener
    if data is not None:
        return opener.open(url, data)
    return opener.open(url)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve url through the shared module-level opener.

    All arguments are handed unchanged to the opener's retrieve()
    method, whose result is returned.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        # First use: create the shared opener and remember it.
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    """Run cleanup() on the shared opener, if one has been created."""
    opener = _urlopener
    if not opener:
        return
    opener.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000082
83
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000084ftpcache = {}
85class URLopener:
Guido van Rossume7b146f2000-02-04 15:28:42 +000086 """Class to open URLs.
87 This is a class rather than just a subroutine because we may need
88 more than one set of global protocol-specific options.
89 Note -- this is a base class for those who don't want the
90 automatic handling of errors type 302 (relocated) and 401
91 (authorization needed)."""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000092
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000093 __tempfiles = None
Guido van Rossum29e77811996-11-27 19:39:58 +000094
Guido van Rossumba311382000-08-24 16:18:04 +000095 version = "Python-urllib/%s" % __version__
96
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000097 # Constructor
Guido van Rossum09c8b6c1999-12-07 21:37:17 +000098 def __init__(self, proxies=None, **x509):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000099 if proxies is None:
100 proxies = getproxies()
101 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
102 self.proxies = proxies
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000103 self.key_file = x509.get('key_file')
104 self.cert_file = x509.get('cert_file')
Guido van Rossumba311382000-08-24 16:18:04 +0000105 self.addheaders = [('User-agent', self.version)]
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000106 self.__tempfiles = []
107 self.__unlink = os.unlink # See cleanup()
108 self.tempcache = None
109 # Undocumented feature: if you assign {} to tempcache,
110 # it is used to cache files retrieved with
111 # self.retrieve(). This is not enabled by default
112 # since it does not work for changing documents (and I
113 # haven't got the logic to check expiration headers
114 # yet).
115 self.ftpcache = ftpcache
116 # Undocumented feature: you can use a different
117 # ftp cache by assigning to the .ftpcache member;
118 # in case you want logically independent URL openers
119 # XXX This is not threadsafe. Bah.
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000120
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000121 def __del__(self):
122 self.close()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000123
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000124 def close(self):
125 self.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000126
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000127 def cleanup(self):
128 # This code sometimes runs when the rest of this module
129 # has already been deleted, so it can't use any globals
130 # or import anything.
131 if self.__tempfiles:
132 for file in self.__tempfiles:
133 try:
134 self.__unlink(file)
135 except:
136 pass
137 del self.__tempfiles[:]
138 if self.tempcache:
139 self.tempcache.clear()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000140
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000141 def addheader(self, *args):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000142 """Add a header to be used by the HTTP interface only
143 e.g. u.addheader('Accept', 'sound/basic')"""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000144 self.addheaders.append(args)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000145
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000146 # External interface
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000147 def open(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000148 """Use URLopener().open(file) instead of open(file, 'r')."""
Martin v. Löwis1d994332000-12-03 18:30:10 +0000149 fullurl = unwrap(toBytes(fullurl))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000150 if self.tempcache and self.tempcache.has_key(fullurl):
151 filename, headers = self.tempcache[fullurl]
152 fp = open(filename, 'rb')
153 return addinfourl(fp, headers, fullurl)
Martin v. Löwis1d994332000-12-03 18:30:10 +0000154 urltype, url = splittype(fullurl)
155 if not urltype:
156 urltype = 'file'
157 if self.proxies.has_key(urltype):
158 proxy = self.proxies[urltype]
159 urltype, proxyhost = splittype(proxy)
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000160 host, selector = splithost(proxyhost)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000161 url = (host, fullurl) # Signal special case to open_*()
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000162 else:
163 proxy = None
Martin v. Löwis1d994332000-12-03 18:30:10 +0000164 name = 'open_' + urltype
165 self.type = urltype
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000166 if '-' in name:
167 # replace - with _
Guido van Rossumb2493f82000-12-15 15:01:37 +0000168 name = '_'.join(name.split('-'))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000169 if not hasattr(self, name):
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000170 if proxy:
171 return self.open_unknown_proxy(proxy, fullurl, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000172 else:
173 return self.open_unknown(fullurl, data)
174 try:
175 if data is None:
176 return getattr(self, name)(url)
177 else:
178 return getattr(self, name)(url, data)
179 except socket.error, msg:
180 raise IOError, ('socket error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000181
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000182 def open_unknown(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000183 """Overridable interface to open unknown URL type."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000184 type, url = splittype(fullurl)
185 raise IOError, ('url error', 'unknown url type', type)
Guido van Rossumca445401995-08-29 19:19:12 +0000186
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000187 def open_unknown_proxy(self, proxy, fullurl, data=None):
188 """Overridable interface to open unknown URL type."""
189 type, url = splittype(fullurl)
190 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
191
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000192 # External interface
Sjoerd Mullenderd7b86f02000-08-25 11:23:36 +0000193 def retrieve(self, url, filename=None, reporthook=None, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000194 """retrieve(url) returns (filename, None) for a local object
195 or (tempfilename, headers) for a remote object."""
Martin v. Löwis1d994332000-12-03 18:30:10 +0000196 url = unwrap(toBytes(url))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000197 if self.tempcache and self.tempcache.has_key(url):
198 return self.tempcache[url]
199 type, url1 = splittype(url)
200 if not filename and (not type or type == 'file'):
201 try:
202 fp = self.open_local_file(url1)
203 hdrs = fp.info()
204 del fp
205 return url2pathname(splithost(url1)[1]), hdrs
206 except IOError, msg:
207 pass
Fred Drake316a7932000-08-24 01:01:26 +0000208 fp = self.open(url, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000209 headers = fp.info()
210 if not filename:
211 import tempfile
212 garbage, path = splittype(url)
213 garbage, path = splithost(path or "")
214 path, garbage = splitquery(path or "")
215 path, garbage = splitattr(path or "")
216 suffix = os.path.splitext(path)[1]
217 filename = tempfile.mktemp(suffix)
218 self.__tempfiles.append(filename)
219 result = filename, headers
220 if self.tempcache is not None:
221 self.tempcache[url] = result
222 tfp = open(filename, 'wb')
223 bs = 1024*8
224 size = -1
225 blocknum = 1
226 if reporthook:
227 if headers.has_key("content-length"):
228 size = int(headers["Content-Length"])
229 reporthook(0, bs, size)
230 block = fp.read(bs)
231 if reporthook:
232 reporthook(1, bs, size)
233 while block:
234 tfp.write(block)
235 block = fp.read(bs)
236 blocknum = blocknum + 1
237 if reporthook:
238 reporthook(blocknum, bs, size)
239 fp.close()
240 tfp.close()
241 del fp
242 del tfp
243 return result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000244
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000245 # Each method named open_<type> knows how to open that type of URL
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000246
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000247 def open_http(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000248 """Use HTTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000249 import httplib
250 user_passwd = None
Martin v. Löwis1d994332000-12-03 18:30:10 +0000251 if type(url) is types.StringType:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000252 host, selector = splithost(url)
253 if host:
254 user_passwd, host = splituser(host)
255 host = unquote(host)
256 realhost = host
257 else:
258 host, selector = url
259 urltype, rest = splittype(selector)
260 url = rest
261 user_passwd = None
Guido van Rossumb2493f82000-12-15 15:01:37 +0000262 if urltype.lower() != 'http':
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000263 realhost = None
264 else:
265 realhost, rest = splithost(rest)
266 if realhost:
267 user_passwd, realhost = splituser(realhost)
268 if user_passwd:
269 selector = "%s://%s%s" % (urltype, realhost, rest)
270 #print "proxy via http:", host, selector
271 if not host: raise IOError, ('http error', 'no host given')
272 if user_passwd:
273 import base64
Guido van Rossumb2493f82000-12-15 15:01:37 +0000274 auth = base64.encodestring(user_passwd).strip()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000275 else:
276 auth = None
277 h = httplib.HTTP(host)
278 if data is not None:
279 h.putrequest('POST', selector)
280 h.putheader('Content-type', 'application/x-www-form-urlencoded')
281 h.putheader('Content-length', '%d' % len(data))
282 else:
283 h.putrequest('GET', selector)
284 if auth: h.putheader('Authorization', 'Basic %s' % auth)
285 if realhost: h.putheader('Host', realhost)
286 for args in self.addheaders: apply(h.putheader, args)
287 h.endheaders()
288 if data is not None:
289 h.send(data + '\r\n')
290 errcode, errmsg, headers = h.getreply()
291 fp = h.getfile()
292 if errcode == 200:
293 return addinfourl(fp, headers, "http:" + url)
294 else:
295 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000296 return self.http_error(url, fp, errcode, errmsg, headers)
Guido van Rossum29aab751999-03-09 19:31:21 +0000297 else:
298 return self.http_error(url, fp, errcode, errmsg, headers, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000299
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000300 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000301 """Handle http errors.
302 Derived class can override this, or provide specific handlers
303 named http_error_DDD where DDD is the 3-digit error code."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000304 # First check if there's a specific handler for this error
305 name = 'http_error_%d' % errcode
306 if hasattr(self, name):
307 method = getattr(self, name)
308 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000309 result = method(url, fp, errcode, errmsg, headers)
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000310 else:
311 result = method(url, fp, errcode, errmsg, headers, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000312 if result: return result
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000313 return self.http_error_default(url, fp, errcode, errmsg, headers)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000314
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000315 def http_error_default(self, url, fp, errcode, errmsg, headers):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000316 """Default error handler: close the connection and raise IOError."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000317 void = fp.read()
318 fp.close()
319 raise IOError, ('http error', errcode, errmsg, headers)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000320
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000321 if hasattr(socket, "ssl"):
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000322 def open_https(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000323 """Use HTTPS protocol."""
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000324 import httplib
Fred Drake567ca8e2000-08-21 21:42:42 +0000325 user_passwd = None
Moshe Zadkab2a0a832001-01-08 07:09:25 +0000326 if type(url) is types.StringType:
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000327 host, selector = splithost(url)
Fred Drake567ca8e2000-08-21 21:42:42 +0000328 if host:
329 user_passwd, host = splituser(host)
330 host = unquote(host)
331 realhost = host
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000332 else:
333 host, selector = url
334 urltype, rest = splittype(selector)
Fred Drake567ca8e2000-08-21 21:42:42 +0000335 url = rest
336 user_passwd = None
Guido van Rossumb2493f82000-12-15 15:01:37 +0000337 if urltype.lower() != 'https':
Fred Drake567ca8e2000-08-21 21:42:42 +0000338 realhost = None
339 else:
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000340 realhost, rest = splithost(rest)
Fred Drake567ca8e2000-08-21 21:42:42 +0000341 if realhost:
342 user_passwd, realhost = splituser(realhost)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000343 if user_passwd:
344 selector = "%s://%s%s" % (urltype, realhost, rest)
Andrew M. Kuchling7ad47922000-06-10 01:41:48 +0000345 #print "proxy via https:", host, selector
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000346 if not host: raise IOError, ('https error', 'no host given')
347 if user_passwd:
348 import base64
Guido van Rossumb2493f82000-12-15 15:01:37 +0000349 auth = base64.encodestring(user_passwd).strip()
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000350 else:
351 auth = None
352 h = httplib.HTTPS(host, 0,
353 key_file=self.key_file,
354 cert_file=self.cert_file)
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000355 if data is not None:
356 h.putrequest('POST', selector)
357 h.putheader('Content-type',
358 'application/x-www-form-urlencoded')
359 h.putheader('Content-length', '%d' % len(data))
360 else:
361 h.putrequest('GET', selector)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000362 if auth: h.putheader('Authorization: Basic %s' % auth)
Fred Drake567ca8e2000-08-21 21:42:42 +0000363 if realhost: h.putheader('Host', realhost)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000364 for args in self.addheaders: apply(h.putheader, args)
365 h.endheaders()
Andrew M. Kuchling43c5af02000-04-24 14:17:06 +0000366 if data is not None:
367 h.send(data + '\r\n')
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000368 errcode, errmsg, headers = h.getreply()
369 fp = h.getfile()
370 if errcode == 200:
371 return addinfourl(fp, headers, url)
372 else:
Fred Drake567ca8e2000-08-21 21:42:42 +0000373 if data is None:
374 return self.http_error(url, fp, errcode, errmsg, headers)
375 else:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000376 return self.http_error(url, fp, errcode, errmsg, headers,
377 data)
Fred Drake567ca8e2000-08-21 21:42:42 +0000378
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000379 def open_gopher(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000380 """Use Gopher protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000381 import gopherlib
382 host, selector = splithost(url)
383 if not host: raise IOError, ('gopher error', 'no host given')
384 host = unquote(host)
385 type, selector = splitgophertype(selector)
386 selector, query = splitquery(selector)
387 selector = unquote(selector)
388 if query:
389 query = unquote(query)
390 fp = gopherlib.send_query(selector, query, host)
391 else:
392 fp = gopherlib.send_selector(selector, host)
393 return addinfourl(fp, noheaders(), "gopher:" + url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000394
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000395 def open_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000396 """Use local file or FTP depending on form of URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000397 if url[:2] == '//' and url[2:3] != '/':
398 return self.open_ftp(url)
399 else:
400 return self.open_local_file(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000401
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000402 def open_local_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000403 """Use local file."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000404 import mimetypes, mimetools, StringIO
405 mtype = mimetypes.guess_type(url)[0]
406 headers = mimetools.Message(StringIO.StringIO(
407 'Content-Type: %s\n' % (mtype or 'text/plain')))
408 host, file = splithost(url)
409 if not host:
Guido van Rossum336a2011999-06-24 15:27:36 +0000410 urlfile = file
411 if file[:1] == '/':
412 urlfile = 'file://' + file
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000413 return addinfourl(open(url2pathname(file), 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000414 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000415 host, port = splitport(host)
416 if not port \
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000417 and socket.gethostbyname(host) in (localhost(), thishost()):
Guido van Rossum336a2011999-06-24 15:27:36 +0000418 urlfile = file
419 if file[:1] == '/':
420 urlfile = 'file://' + file
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000421 return addinfourl(open(url2pathname(file), 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000422 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000423 raise IOError, ('local file error', 'not on local host')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000424
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000425 def open_ftp(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000426 """Use FTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000427 host, path = splithost(url)
428 if not host: raise IOError, ('ftp error', 'no host given')
429 host, port = splitport(host)
430 user, host = splituser(host)
431 if user: user, passwd = splitpasswd(user)
432 else: passwd = None
433 host = unquote(host)
434 user = unquote(user or '')
435 passwd = unquote(passwd or '')
436 host = socket.gethostbyname(host)
437 if not port:
438 import ftplib
439 port = ftplib.FTP_PORT
440 else:
441 port = int(port)
442 path, attrs = splitattr(path)
443 path = unquote(path)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000444 dirs = path.split('/')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000445 dirs, file = dirs[:-1], dirs[-1]
446 if dirs and not dirs[0]: dirs = dirs[1:]
Guido van Rossum5e006a31999-08-18 17:40:33 +0000447 if dirs and not dirs[0]: dirs[0] = '/'
Guido van Rossumb2493f82000-12-15 15:01:37 +0000448 key = user, host, port, '/'.join(dirs)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000449 # XXX thread unsafe!
450 if len(self.ftpcache) > MAXFTPCACHE:
451 # Prune the cache, rather arbitrarily
452 for k in self.ftpcache.keys():
453 if k != key:
454 v = self.ftpcache[k]
455 del self.ftpcache[k]
456 v.close()
457 try:
458 if not self.ftpcache.has_key(key):
459 self.ftpcache[key] = \
460 ftpwrapper(user, passwd, host, port, dirs)
461 if not file: type = 'D'
462 else: type = 'I'
463 for attr in attrs:
464 attr, value = splitvalue(attr)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000465 if attr.lower() == 'type' and \
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000466 value in ('a', 'A', 'i', 'I', 'd', 'D'):
Guido van Rossumb2493f82000-12-15 15:01:37 +0000467 type = value.upper()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000468 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
469 if retrlen is not None and retrlen >= 0:
470 import mimetools, StringIO
471 headers = mimetools.Message(StringIO.StringIO(
472 'Content-Length: %d\n' % retrlen))
473 else:
474 headers = noheaders()
475 return addinfourl(fp, headers, "ftp:" + url)
476 except ftperrors(), msg:
477 raise IOError, ('ftp error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000478
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000479 def open_data(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000480 """Use "data" URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000481 # ignore POSTed data
482 #
483 # syntax of data URLs:
484 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
485 # mediatype := [ type "/" subtype ] *( ";" parameter )
486 # data := *urlchar
487 # parameter := attribute "=" value
488 import StringIO, mimetools, time
489 try:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000490 [type, data] = url.split(',', 1)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000491 except ValueError:
492 raise IOError, ('data error', 'bad data URL')
493 if not type:
494 type = 'text/plain;charset=US-ASCII'
Guido van Rossumb2493f82000-12-15 15:01:37 +0000495 semi = type.rfind(';')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000496 if semi >= 0 and '=' not in type[semi:]:
497 encoding = type[semi+1:]
498 type = type[:semi]
499 else:
500 encoding = ''
501 msg = []
502 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
503 time.gmtime(time.time())))
504 msg.append('Content-type: %s' % type)
505 if encoding == 'base64':
506 import base64
507 data = base64.decodestring(data)
508 else:
509 data = unquote(data)
510 msg.append('Content-length: %d' % len(data))
511 msg.append('')
512 msg.append(data)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000513 msg = '\n'.join(msg)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000514 f = StringIO.StringIO(msg)
515 headers = mimetools.Message(f, 0)
516 f.fileno = None # needed for addinfourl
517 return addinfourl(f, headers, url)
Guido van Rossum6d4d1c21998-03-12 14:32:55 +0000518
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000519
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as URLopener.

        Keyword arguments (e.g. key_file, cert_file) are forwarded
        too, so they can be used with this class.
        """
        apply(URLopener.__init__, (self,) + args, kwargs)
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        After maxtries nested redirects the response is reported as a
        500 "Redirect Recursion" error instead of being followed.
        """
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        try:
            result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                            data)
        finally:
            # Reset even when the redirected open raises, so failures
            # do not accumulate towards the limit across requests.
            self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Open the URL named by the Location (or URI) header.

        Returns None when neither header is present.
        """
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
        # URLopener.http_error_default() always raises IOError, so each
        # call below ends this method.
        if not headers.has_key('www-authenticate'):
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Retry an http URL with user:passwd@ inserted before the host.

        Returns None when no user name or password was obtained.
        """
        host, selector = splithost(url)
        # i is non-zero when the URL already carried credentials; it
        # is passed on as clear_cache so the rejected pair is dropped.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Retry an https URL with user:passwd@ inserted before the host.

        Returns None when no user name or password was obtained.
        """
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = '//' + host + selector
        return self.open_https(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return (user, passwd) for realm at host, prompting if needed.

        Results are cached under realm@host; a true clear_cache drops
        the cached entry and prompts again.
        """
        key = realm + '@' + host.lower()
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000638
639
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000640# Utility functions
641
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup is done once; the answer is kept in _localhost.
    """
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000649
_thishost = None
def thishost():
    """Return the IP address of the current host.

    The lookup is done once; the answer is kept in _thishost.
    """
    global _thishost
    if _thishost:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000657
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported on first use only; the value of
    ftplib.all_errors is then kept in _ftperrors.
    """
    global _ftperrors
    if _ftperrors:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000666
_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object.

    The object is built on first use and kept in the module-level
    _noheaders, so every caller shares the same instance.
    """
    global _noheaders
    # Compare against None instead of testing truth: a Message without
    # any header has length zero and is therefore false, so a plain
    # "if not _noheaders" rebuilt the object on every single call.
    if _noheaders is None:
        import mimetools
        import StringIO
        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
        _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000677
678
679# Utility classes
680
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections.

    Holds one ftplib.FTP connection together with the parameters
    needed to re-establish it (user, passwd, host, port, dirs).
    """

    def __init__(self, user, passwd, host, port, dirs):
        """Remember the connection parameters and connect immediately."""
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.init()

    def init(self):
        """(Re)connect: open the connection, log in, cwd through self.dirs."""
        import ftplib
        # busy is true while a transfer's final response is still pending.
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        """Start retrieving file, or a directory listing.

        type 'd' or 'D' asks for a listing; any other value is sent
        to the server as 'TYPE <type>'.  Returns a 2-tuple: a file
        object wrapped so that closing it calls endtransfer(), and
        the second element of ntransfercmd()'s result.  Raises
        IOError('ftp error', reason) for permanent FTP errors other
        than a 550 reply to RETR.
        """
        import ftplib
        # Finish any transfer still outstanding on this connection.
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The command failed; reconnect once and retry.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Use nlst to see if the file exists at all
            try:
                self.ftp.nlst(file)
            except ftplib.error_perm, reason:
                raise IOError, ('ftp error', reason), sys.exc_info()[2]
            # Restore the transfer mode!
            self.ftp.voidcmd(cmd)
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm, reason:
                # A 550 reply leaves conn as None, so the listing
                # attempt below runs; anything else is fatal.
                if str(reason)[:3] != '550':
                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing
            if file: cmd = 'LIST ' + file
            else: cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])

    def endtransfer(self):
        """Finish a pending transfer; does nothing unless busy is set."""
        if not self.busy:
            return
        self.busy = 0
        try:
            # Read the server's response to the transfer command.
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        """Finish any pending transfer and close the connection.

        FTP errors raised while closing are ignored.
        """
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000753
class addbase:
    """Base class for addinfo and addclosehook.

    Wraps a file-like object fp and re-exports its read methods as
    instance attributes.
    """

    def __init__(self, fp):
        self.fp = fp
        self.read = fp.read
        self.readline = fp.readline
        # readlines() and fileno() are forwarded only when fp has them.
        for optional in ("readlines", "fileno"):
            if hasattr(fp, optional):
                setattr(self, optional, getattr(fp, optional))

    def __repr__(self):
        return '<%s at %s whose fp = %s>' % (self.__class__.__name__,
                                             repr(id(self)), repr(self.fp))

    def close(self):
        """Drop the forwarded methods, then close and forget fp."""
        self.read = self.readline = self.readlines = self.fileno = None
        if self.fp:
            self.fp.close()
        self.fp = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000775
class addclosehook(addbase):
    """Class to add a close hook to an open file.

    closehook(*hookargs) is called once, after the wrapped file has
    been closed.
    """

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        addbase.close(self)
        hook = self.closehook
        if hook:
            hook(*self.hookargs)
            # Forget the hook so a second close() does not call it again.
            self.closehook = None
            self.hookargs = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000790
class addinfo(addbase):
    """Class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        # Kept as given; handed back unchanged by info().
        self.headers = headers

    def info(self):
        """Return the headers object passed to the constructor."""
        return self.headers
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000800
class addinfourl(addbase):
    """Class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        addbase.__init__(self, fp)
        # Both values are kept as given and handed back unchanged.
        self.headers, self.url = headers, url

    def info(self):
        """Return the headers object passed to the constructor."""
        return self.headers

    def geturl(self):
        """Return the URL passed to the constructor."""
        return self.url
Guido van Rossume6ad8911996-09-10 17:02:56 +0000814
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000815
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL.

    url is returned unchanged when it already has a type (scheme), or
    when it has a host but base supplies no type.  Otherwise the type,
    and if necessary the host and directory part of the path, are
    taken from base.  Any tag ('#...') or query ('?...') on base is
    discarded.
    """
    type, path = splittype(url)
    if type:
        # if url is complete (i.e., it contains a type), return it
        return url
    host, path = splithost(path)
    type, basepath = splittype(base) # inherit type from base
    if host:
        # if url contains host, just inherit type
        if type: return type + '://' + host + path
        else:
            # no type inherited, so url must have started with //
            # just return it
            return url
    host, basepath = splithost(basepath) # inherit host
    # basetag and basequery are split off only to strip them from
    # basepath; neither value is used again.
    basepath, basetag = splittag(basepath) # remove extraneous cruft
    basepath, basequery = splitquery(basepath) # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to basepath
            i = len(basepath)
        else:
            # else replace last component
            i = basepath.rfind('/')
        if i < 0:
            # basepath not absolute
            if host:
                # host present, make absolute
                basepath = '/'
            else:
                # else keep non-absolute
                basepath = ''
        else:
            # remove last file component
            basepath = basepath[:i+1]
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            # Look for the slash that ends the parent directory,
            # ignoring basepath's own final character.
            i = basepath[:-1].rfind('/')
            if i > 0:
                basepath = basepath[:i+1]
            elif i == 0:
                # Reached the root; further '../' segments stay in path.
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    # With a host the path has to be absolute.
    if host and path and path[0] != '/':
        path = '/' + path
    if type and host: return type + '://' + host + path
    elif type: return type + ':' + path
    elif host: return '//' + host + path # don't know what this means
    else: return path
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000872
873
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# splitgophertype('/Xselector') --> 'X', 'selector'
# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
889
def toBytes(url):
    """toBytes(u"URL") --> 'URL'.

    Unicode objects are encoded as ASCII; any other object is returned
    untouched.  Raises UnicodeError when the URL contains non-ASCII
    characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if type(url) is not types.UnicodeType:
        return url
    try:
        return url.encode("ASCII")
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
901
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Strips surrounding whitespace, one pair of angle brackets and a
    leading 'URL:' marker, in that order.
    """
    url = url.strip()
    bracketed = url[:1] == '<' and url[-1:] == '>'
    if bracketed:
        url = url[1:-1].strip()
    if url[:4] == 'URL:':
        url = url[4:].strip()
    return url
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000909
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case; it is None when url does not
    start with 'type:'.
    """
    global _typeprog
    if _typeprog is None:
        # Compile the pattern on first use only.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if not found:
        return None, url
    scheme = found.group(1)
    rest = url[len(scheme) + 1:]
    return scheme.lower(), rest
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000923
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host is None when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # Compile the pattern on first use only.
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    found = _hostprog.match(url)
    if not found:
        return None, url
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000935
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    When an '@' is present, both parts are passed through unquote()
    and returned as a two-element list; otherwise the result is the
    tuple (None, host).
    """
    global _userprog
    if _userprog is None:
        # Compile the pattern on first use only.
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    found = _userprog.match(host)
    if not found:
        return None, host
    return [unquote(part) for part in found.group(1, 2)]
Guido van Rossum7c395db1994-07-04 22:14:49 +0000947
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; passwd is None without one.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compile the pattern on first use only.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if not found:
        return user, None
    return found.group(1, 2)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000959
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  It is None, and host
    is returned whole, unless host ends in ':' followed by digits.
    """
    global _portprog
    if _portprog is None:
        # Compile the pattern on first use only.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if not found:
        return host, None
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000972
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        # Compile the pattern on first use only.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if not found:
        return host, defport
    host, port = found.group(1, 2)
    # Stays None when the text after ':' is empty or not a number.
    nport = None
    if port:
        try:
            nport = int(port)
        except ValueError:
            pass
    return host, nport
Guido van Rossum53725a21996-06-13 19:12:35 +0000994
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'; query is None without one.
    """
    global _queryprog
    if _queryprog is None:
        # Compile the pattern on first use only.
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    found = _queryprog.match(url)
    if not found:
        return url, None
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001006
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'; tag is None without one.
    """
    global _tagprog
    if _tagprog is None:
        # Compile the pattern on first use only.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if not found:
        return url, None
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001018
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    pieces = url.split(';')
    path, attrs = pieces[0], pieces[1:]
    return path, attrs
Guido van Rossum7c395db1994-07-04 22:14:49 +00001024
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='; value is None without one.
    """
    global _valueprog
    if _valueprog is None:
        # Compile the pattern on first use only.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if not found:
        return attr, None
    return found.group(1, 2)
Guido van Rossum7c395db1994-07-04 22:14:49 +00001036
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    The type is None, and selector is returned whole, unless selector
    starts with '/' and has at least one more character.
    """
    if selector[:1] != '/' or not selector[1:2]:
        return None, selector
    gtype, rest = selector[1], selector[2:]
    return gtype, rest
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001042
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Every '%' followed by exactly two hexadecimal digits is replaced
    by the character with that code.  Any other '%' is left in place.
    """
    hexdigits = '0123456789abcdefABCDEF'
    pieces = s.split('%')
    res = [pieces[0]]
    append = res.append
    for item in pieces[1:]:
        # Check the two digits explicitly: int() alone also accepts a
        # sign or surrounding whitespace, which would wrongly decode
        # escapes such as '% 1' or '%+f'.
        if item[1:2] and item[0] in hexdigits and item[1] in hexdigits:
            try:
                append(chr(int(item[:2], 16)) + item[2:])
            except ValueError:
                # Joining a high byte to a unicode string can raise
                # UnicodeError, a ValueError subclass; keep the escape.
                append('%' + item)
        else:
            append('%' + item)
    return "".join(res)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001061
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'

    Like unquote(), but every '+' is first turned into a space.
    """
    # '+' stands for a space; translate before the %XX escapes are
    # decoded, so that an escaped plus ('%2B') comes out as '+'.
    return unquote(s.replace('+', ' '))
Guido van Rossum0564e121996-12-13 14:47:36 +00001068
# Characters that quote() never escapes, whatever its 'safe' argument.
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')

# The safe set quote() ends up with for its default safe='/'.
_fast_safe_test = always_safe + '/'
# Lookup table for _fast_quote(), built on first use.
_fast_safe = None

def _fast_quote(s):
    """Quote s using the default safe set (always_safe plus '/')."""
    global _fast_safe
    if _fast_safe is None:
        table = {}
        for ch in _fast_safe_test:
            table[ch] = ch
        _fast_safe = table
    lookup = _fast_safe.get
    res = []
    for c in s:
        # Safe characters map to themselves; for anything else the
        # lookup gives None and the %XX escape is used.
        res.append(lookup(c) or '%%%02X' % ord(c))
    return ''.join(res)
Jeremy Hylton7ae51bf2000-09-14 16:59:07 +00001088
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    Characters in 'safe', like those in always_safe, are copied
    unchanged; every other character becomes a %XX escape.
    """
    safe = always_safe + safe
    if safe == _fast_safe_test:
        # Default safe set: use the table-driven implementation.
        return _fast_quote(s)
    pieces = []
    for c in s:
        if c in safe:
            pieces.append(c)
        else:
            pieces.append('%%%02X' % ord(c))
    return ''.join(pieces)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001119
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'

    Everything between the spaces is quoted with quote(), using the
    given safe characters.
    """
    if ' ' not in s:
        return quote(s, safe)
    return '+'.join([quote(word, safe) for word in s.split(' ')])
Guido van Rossum0564e121996-12-13 14:47:36 +00001129
def urlencode(query,doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values are converted with str() and quoted with
    quote_plus(); the resulting 'key=value' pairs are joined with '&'.
    Raises TypeError if query is neither a mapping nor a sequence
    whose first element is a tuple.
    """

    if hasattr(query,"items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            # (x itself is never used; the call is made for the check)
            x = len(query)
            # non-empty strings will fail this
            if len(query) and type(query[0]) != types.TupleType:
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            # Re-raise with a clearer message, keeping the traceback.
            ty,va,tb = sys.exc_info()
            raise TypeError, "not a valid non-string sequence or mapping object", tb

    l = []
    if not doseq:
        # preserve old behavior
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if type(v) == types.StringType:
                # Plain strings are sequences too; treat as one value.
                v = quote_plus(v)
                l.append(k + '=' + v)
            elif type(v) == types.UnicodeType:
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = quote_plus(v.encode("ASCII","replace"))
                l.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)
Guido van Rossum810a3391998-07-22 21:33:23 +00001193
Guido van Rossum442e7201996-03-20 15:33:11 +00001194# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    Variable names are matched without regard to case, and variables
    with an empty value are ignored.

    """
    suffix = '_proxy'
    proxies = {}
    for name, value in os.environ.items():
        if not value:
            continue
        name = name.lower()
        if name[-6:] == suffix:
            proxies[name[:-6]] = value
    return proxies
1210
# Pick the getproxies() implementation that fits the platform.
if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  The HTTP proxy is read from the HTTPProxyHost
        key when the UseHTTPProxy key is set.  An empty dictionary
        is returned if the ic module is missing or cannot be opened.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.  The ProxyEnable
        and ProxyServer values are read from the current user's
        Internet Settings key.  If anything cannot be read or parsed,
        whatever was collected up to that point is returned.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    # (';'-separated 'protocol=address' entries)
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        proxies[protocol] = '%s://%s' % (protocol, address)
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            # NOTE(review): not reached when an exception is raised above.
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        # The registry is consulted only when the environment gives
        # no proxies at all (an empty dictionary).
        return getproxies_environment() or getproxies_registry()
else:
    # By default use environment variables
    getproxies = getproxies_environment
1295
Guido van Rossum442e7201996-03-20 15:33:11 +00001296
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001297# Test and time quote() and unquote()
1298def test1():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001299 import time
1300 s = ''
1301 for i in range(256): s = s + chr(i)
1302 s = s*4
1303 t0 = time.time()
1304 qs = quote(s)
1305 uqs = unquote(qs)
1306 t1 = time.time()
1307 if uqs != s:
1308 print 'Wrong!'
1309 print `s`
1310 print `qs`
1311 print `uqs`
1312 print round(t1 - t0, 3), 'sec'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001313
1314
def reporthook(blocknum, blocksize, totalsize):
    """Print one progress line; test() passes this to urlretrieve()."""
    # Report during remote transfers
    print "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)
Guido van Rossum9ab96d41998-09-28 14:07:00 +00001319
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001320# Test program
Guido van Rossum23490151998-06-25 02:39:00 +00001321def test(args=[]):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001322 if not args:
1323 args = [
1324 '/etc/passwd',
1325 'file:/etc/passwd',
1326 'file://localhost/etc/passwd',
1327 'ftp://ftp.python.org/etc/passwd',
1328## 'gopher://gopher.micro.umn.edu/1/',
1329 'http://www.python.org/index.html',
1330 ]
Guido van Rossum09c8b6c1999-12-07 21:37:17 +00001331 if hasattr(URLopener, "open_https"):
1332 args.append('https://synergy.as.cmu.edu/~geek/')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001333 try:
1334 for url in args:
1335 print '-'*10, url, '-'*10
1336 fn, h = urlretrieve(url, None, reporthook)
Guido van Rossumb2493f82000-12-15 15:01:37 +00001337 print fn
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001338 if h:
1339 print '======'
1340 for k in h.keys(): print k + ':', h[k]
1341 print '======'
1342 fp = open(fn, 'rb')
1343 data = fp.read()
1344 del fp
1345 if '\r' in data:
1346 table = string.maketrans("", "")
Guido van Rossumb2493f82000-12-15 15:01:37 +00001347 data = data.translate(table, "\r")
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001348 print data
1349 fn, h = None, None
1350 print '-'*40
1351 finally:
1352 urlcleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001353
Guido van Rossum23490151998-06-25 02:39:00 +00001354def main():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001355 import getopt, sys
1356 try:
1357 opts, args = getopt.getopt(sys.argv[1:], "th")
1358 except getopt.error, msg:
1359 print msg
1360 print "Use -h for help"
1361 return
1362 t = 0
1363 for o, a in opts:
1364 if o == '-t':
1365 t = t + 1
1366 if o == '-h':
1367 print "Usage: python urllib.py [-t] [url ...]"
1368 print "-t runs self-test;",
1369 print "otherwise, contents of urls are printed"
1370 return
1371 if t:
1372 if t > 1:
1373 test1()
1374 test(args)
1375 else:
1376 if not args:
1377 print "Use -h for help"
1378 for url in args:
1379 print urlopen(url).read(),
Guido van Rossum23490151998-06-25 02:39:00 +00001380
# Script entry point: invoke the command-line driver when this module
# is executed directly rather than imported.
if __name__ == "__main__":
    main()