blob: 3175199a96ab6eae7721d078cbd4d7afe1ba47a3 [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""Open an arbitrary URL.
2
3See the following document for more info on URLs:
4"Names and Addresses, URIs, URLs, URNs, URCs", at
5http://www.w3.org/pub/WWW/Addressing/Overview.html
6
7See also the HTTP spec (from which the error codes are derived):
8"HTTP - Hypertext Transfer Protocol", at
9http://www.w3.org/pub/WWW/Protocols/
10
11Related standards and specs:
12- RFC1808: the "relative URL" spec. (authoritative status)
13- RFC1738 - the "URL standard". (authoritative status)
14- RFC1630 - the "URI spec". (informational status)
15
The object returned by URLopener().open(file) will differ per
protocol.  All you know is that it has methods read(), readline(),
readlines(), fileno(), close() and info().  The read*(), fileno()
and close() methods work like those of open files.
The info() method returns a mimetools.Message object which can be
used to query various info about the object, if available.
(mimetools.Message objects are queried with the getheader() method.)
23"""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000024
Guido van Rossum7c395db1994-07-04 22:14:49 +000025import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000026import socket
Jack Jansendc3e3f61995-12-15 13:22:13 +000027import os
Guido van Rossum3c8484e1996-11-20 22:02:24 +000028import sys
Martin v. Löwis1d994332000-12-03 18:30:10 +000029import types
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000030
# Names exported by "from urllib import *".
__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url", "splittag",
           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "splitgophertype", "getproxies"]

# Interpolated into URLopener.version, which becomes the default
# User-agent header value.
__version__ = '1.15'    # XXX This version is not always updated :-(

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
Guido van Rossum6cb15a01995-06-22 19:00:13 +000042
# Helper for non-unix systems: mac, nt and riscos each supply their own
# url2pathname()/pathname2url() pair; every other platform uses the
# plain quoting functions.
if os.name not in ('mac', 'nt', 'riscos'):
    def url2pathname(pathname):
        """Turn the path part of a URL into a local pathname (unquote())."""
        return unquote(pathname)

    def pathname2url(pathname):
        """Turn a local pathname into the path part of a URL (quote())."""
        return quote(pathname)
elif os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    # os.name == 'riscos'
    from rourl2path import url2pathname, pathname2url
Guido van Rossum33add0a1998-12-18 15:25:22 +000055
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000056# This really consists of two pieces:
57# (1) a class which handles opening of all sorts of URLs
58# (plus assorted utilities etc.)
59# (2) a set of functions for parsing URLs
60# XXX Should these be separated out into different modules?
61
62
# Shortcut for basic usage: one FancyURLopener, created on first use,
# is shared by urlopen(), urlretrieve() and urlcleanup().
_urlopener = None
def urlopen(url, data=None):
    """urlopen(url [, data]) -> open file-like object"""
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    # Only pass data along when the caller supplied it.
    if data is not None:
        return opener.open(url, data)
    return opener.open(url)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Call retrieve() on the shared opener, creating it if necessary."""
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    """Call cleanup() on the shared opener; do nothing if none exists."""
    if not _urlopener:
        return
    _urlopener.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000082
83
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000084ftpcache = {}
85class URLopener:
Guido van Rossume7b146f2000-02-04 15:28:42 +000086 """Class to open URLs.
87 This is a class rather than just a subroutine because we may need
88 more than one set of global protocol-specific options.
89 Note -- this is a base class for those who don't want the
90 automatic handling of errors type 302 (relocated) and 401
91 (authorization needed)."""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000092
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000093 __tempfiles = None
Guido van Rossum29e77811996-11-27 19:39:58 +000094
Guido van Rossumba311382000-08-24 16:18:04 +000095 version = "Python-urllib/%s" % __version__
96
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000097 # Constructor
Guido van Rossum09c8b6c1999-12-07 21:37:17 +000098 def __init__(self, proxies=None, **x509):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000099 if proxies is None:
100 proxies = getproxies()
101 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
102 self.proxies = proxies
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000103 self.key_file = x509.get('key_file')
104 self.cert_file = x509.get('cert_file')
Guido van Rossumba311382000-08-24 16:18:04 +0000105 self.addheaders = [('User-agent', self.version)]
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000106 self.__tempfiles = []
107 self.__unlink = os.unlink # See cleanup()
108 self.tempcache = None
109 # Undocumented feature: if you assign {} to tempcache,
110 # it is used to cache files retrieved with
111 # self.retrieve(). This is not enabled by default
112 # since it does not work for changing documents (and I
113 # haven't got the logic to check expiration headers
114 # yet).
115 self.ftpcache = ftpcache
116 # Undocumented feature: you can use a different
117 # ftp cache by assigning to the .ftpcache member;
118 # in case you want logically independent URL openers
119 # XXX This is not threadsafe. Bah.
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000120
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000121 def __del__(self):
122 self.close()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000123
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000124 def close(self):
125 self.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000126
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000127 def cleanup(self):
128 # This code sometimes runs when the rest of this module
129 # has already been deleted, so it can't use any globals
130 # or import anything.
131 if self.__tempfiles:
132 for file in self.__tempfiles:
133 try:
134 self.__unlink(file)
135 except:
136 pass
137 del self.__tempfiles[:]
138 if self.tempcache:
139 self.tempcache.clear()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000140
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000141 def addheader(self, *args):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000142 """Add a header to be used by the HTTP interface only
143 e.g. u.addheader('Accept', 'sound/basic')"""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000144 self.addheaders.append(args)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000145
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000146 # External interface
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000147 def open(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000148 """Use URLopener().open(file) instead of open(file, 'r')."""
Martin v. Löwis1d994332000-12-03 18:30:10 +0000149 fullurl = unwrap(toBytes(fullurl))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000150 if self.tempcache and self.tempcache.has_key(fullurl):
151 filename, headers = self.tempcache[fullurl]
152 fp = open(filename, 'rb')
153 return addinfourl(fp, headers, fullurl)
Martin v. Löwis1d994332000-12-03 18:30:10 +0000154 urltype, url = splittype(fullurl)
155 if not urltype:
156 urltype = 'file'
157 if self.proxies.has_key(urltype):
158 proxy = self.proxies[urltype]
159 urltype, proxyhost = splittype(proxy)
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000160 host, selector = splithost(proxyhost)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000161 url = (host, fullurl) # Signal special case to open_*()
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000162 else:
163 proxy = None
Martin v. Löwis1d994332000-12-03 18:30:10 +0000164 name = 'open_' + urltype
165 self.type = urltype
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000166 if '-' in name:
167 # replace - with _
Guido van Rossumb2493f82000-12-15 15:01:37 +0000168 name = '_'.join(name.split('-'))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000169 if not hasattr(self, name):
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000170 if proxy:
171 return self.open_unknown_proxy(proxy, fullurl, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000172 else:
173 return self.open_unknown(fullurl, data)
174 try:
175 if data is None:
176 return getattr(self, name)(url)
177 else:
178 return getattr(self, name)(url, data)
179 except socket.error, msg:
180 raise IOError, ('socket error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000181
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000182 def open_unknown(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000183 """Overridable interface to open unknown URL type."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000184 type, url = splittype(fullurl)
185 raise IOError, ('url error', 'unknown url type', type)
Guido van Rossumca445401995-08-29 19:19:12 +0000186
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000187 def open_unknown_proxy(self, proxy, fullurl, data=None):
188 """Overridable interface to open unknown URL type."""
189 type, url = splittype(fullurl)
190 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
191
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000192 # External interface
Sjoerd Mullenderd7b86f02000-08-25 11:23:36 +0000193 def retrieve(self, url, filename=None, reporthook=None, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000194 """retrieve(url) returns (filename, None) for a local object
195 or (tempfilename, headers) for a remote object."""
Martin v. Löwis1d994332000-12-03 18:30:10 +0000196 url = unwrap(toBytes(url))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000197 if self.tempcache and self.tempcache.has_key(url):
198 return self.tempcache[url]
199 type, url1 = splittype(url)
200 if not filename and (not type or type == 'file'):
201 try:
202 fp = self.open_local_file(url1)
203 hdrs = fp.info()
204 del fp
205 return url2pathname(splithost(url1)[1]), hdrs
206 except IOError, msg:
207 pass
Fred Drake316a7932000-08-24 01:01:26 +0000208 fp = self.open(url, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000209 headers = fp.info()
210 if not filename:
211 import tempfile
212 garbage, path = splittype(url)
213 garbage, path = splithost(path or "")
214 path, garbage = splitquery(path or "")
215 path, garbage = splitattr(path or "")
216 suffix = os.path.splitext(path)[1]
217 filename = tempfile.mktemp(suffix)
218 self.__tempfiles.append(filename)
219 result = filename, headers
220 if self.tempcache is not None:
221 self.tempcache[url] = result
222 tfp = open(filename, 'wb')
223 bs = 1024*8
224 size = -1
225 blocknum = 1
226 if reporthook:
227 if headers.has_key("content-length"):
228 size = int(headers["Content-Length"])
229 reporthook(0, bs, size)
230 block = fp.read(bs)
231 if reporthook:
232 reporthook(1, bs, size)
233 while block:
234 tfp.write(block)
235 block = fp.read(bs)
236 blocknum = blocknum + 1
237 if reporthook:
238 reporthook(blocknum, bs, size)
239 fp.close()
240 tfp.close()
241 del fp
242 del tfp
243 return result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000244
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000245 # Each method named open_<type> knows how to open that type of URL
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000246
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000247 def open_http(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000248 """Use HTTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000249 import httplib
250 user_passwd = None
Martin v. Löwis1d994332000-12-03 18:30:10 +0000251 if type(url) is types.StringType:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000252 host, selector = splithost(url)
253 if host:
254 user_passwd, host = splituser(host)
255 host = unquote(host)
256 realhost = host
257 else:
258 host, selector = url
259 urltype, rest = splittype(selector)
260 url = rest
261 user_passwd = None
Guido van Rossumb2493f82000-12-15 15:01:37 +0000262 if urltype.lower() != 'http':
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000263 realhost = None
264 else:
265 realhost, rest = splithost(rest)
266 if realhost:
267 user_passwd, realhost = splituser(realhost)
268 if user_passwd:
269 selector = "%s://%s%s" % (urltype, realhost, rest)
270 #print "proxy via http:", host, selector
271 if not host: raise IOError, ('http error', 'no host given')
272 if user_passwd:
273 import base64
Guido van Rossumb2493f82000-12-15 15:01:37 +0000274 auth = base64.encodestring(user_passwd).strip()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000275 else:
276 auth = None
277 h = httplib.HTTP(host)
278 if data is not None:
279 h.putrequest('POST', selector)
280 h.putheader('Content-type', 'application/x-www-form-urlencoded')
281 h.putheader('Content-length', '%d' % len(data))
282 else:
283 h.putrequest('GET', selector)
284 if auth: h.putheader('Authorization', 'Basic %s' % auth)
285 if realhost: h.putheader('Host', realhost)
286 for args in self.addheaders: apply(h.putheader, args)
287 h.endheaders()
288 if data is not None:
289 h.send(data + '\r\n')
290 errcode, errmsg, headers = h.getreply()
291 fp = h.getfile()
292 if errcode == 200:
293 return addinfourl(fp, headers, "http:" + url)
294 else:
295 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000296 return self.http_error(url, fp, errcode, errmsg, headers)
Guido van Rossum29aab751999-03-09 19:31:21 +0000297 else:
298 return self.http_error(url, fp, errcode, errmsg, headers, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000299
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000300 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000301 """Handle http errors.
302 Derived class can override this, or provide specific handlers
303 named http_error_DDD where DDD is the 3-digit error code."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000304 # First check if there's a specific handler for this error
305 name = 'http_error_%d' % errcode
306 if hasattr(self, name):
307 method = getattr(self, name)
308 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000309 result = method(url, fp, errcode, errmsg, headers)
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000310 else:
311 result = method(url, fp, errcode, errmsg, headers, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000312 if result: return result
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000313 return self.http_error_default(url, fp, errcode, errmsg, headers)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000314
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000315 def http_error_default(self, url, fp, errcode, errmsg, headers):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000316 """Default error handler: close the connection and raise IOError."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000317 void = fp.read()
318 fp.close()
319 raise IOError, ('http error', errcode, errmsg, headers)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000320
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000321 if hasattr(socket, "ssl"):
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000322 def open_https(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000323 """Use HTTPS protocol."""
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000324 import httplib
Fred Drake567ca8e2000-08-21 21:42:42 +0000325 user_passwd = None
Moshe Zadkab2a0a832001-01-08 07:09:25 +0000326 if type(url) is types.StringType:
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000327 host, selector = splithost(url)
Fred Drake567ca8e2000-08-21 21:42:42 +0000328 if host:
329 user_passwd, host = splituser(host)
330 host = unquote(host)
331 realhost = host
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000332 else:
333 host, selector = url
334 urltype, rest = splittype(selector)
Fred Drake567ca8e2000-08-21 21:42:42 +0000335 url = rest
336 user_passwd = None
Guido van Rossumb2493f82000-12-15 15:01:37 +0000337 if urltype.lower() != 'https':
Fred Drake567ca8e2000-08-21 21:42:42 +0000338 realhost = None
339 else:
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000340 realhost, rest = splithost(rest)
Fred Drake567ca8e2000-08-21 21:42:42 +0000341 if realhost:
342 user_passwd, realhost = splituser(realhost)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000343 if user_passwd:
344 selector = "%s://%s%s" % (urltype, realhost, rest)
Andrew M. Kuchling7ad47922000-06-10 01:41:48 +0000345 #print "proxy via https:", host, selector
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000346 if not host: raise IOError, ('https error', 'no host given')
347 if user_passwd:
348 import base64
Guido van Rossumb2493f82000-12-15 15:01:37 +0000349 auth = base64.encodestring(user_passwd).strip()
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000350 else:
351 auth = None
352 h = httplib.HTTPS(host, 0,
353 key_file=self.key_file,
354 cert_file=self.cert_file)
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000355 if data is not None:
356 h.putrequest('POST', selector)
357 h.putheader('Content-type',
358 'application/x-www-form-urlencoded')
359 h.putheader('Content-length', '%d' % len(data))
360 else:
361 h.putrequest('GET', selector)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000362 if auth: h.putheader('Authorization: Basic %s' % auth)
Fred Drake567ca8e2000-08-21 21:42:42 +0000363 if realhost: h.putheader('Host', realhost)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000364 for args in self.addheaders: apply(h.putheader, args)
365 h.endheaders()
Andrew M. Kuchling43c5af02000-04-24 14:17:06 +0000366 if data is not None:
367 h.send(data + '\r\n')
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000368 errcode, errmsg, headers = h.getreply()
369 fp = h.getfile()
370 if errcode == 200:
371 return addinfourl(fp, headers, url)
372 else:
Fred Drake567ca8e2000-08-21 21:42:42 +0000373 if data is None:
374 return self.http_error(url, fp, errcode, errmsg, headers)
375 else:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000376 return self.http_error(url, fp, errcode, errmsg, headers,
377 data)
Fred Drake567ca8e2000-08-21 21:42:42 +0000378
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000379 def open_gopher(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000380 """Use Gopher protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000381 import gopherlib
382 host, selector = splithost(url)
383 if not host: raise IOError, ('gopher error', 'no host given')
384 host = unquote(host)
385 type, selector = splitgophertype(selector)
386 selector, query = splitquery(selector)
387 selector = unquote(selector)
388 if query:
389 query = unquote(query)
390 fp = gopherlib.send_query(selector, query, host)
391 else:
392 fp = gopherlib.send_selector(selector, host)
393 return addinfourl(fp, noheaders(), "gopher:" + url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000394
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000395 def open_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000396 """Use local file or FTP depending on form of URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000397 if url[:2] == '//' and url[2:3] != '/':
398 return self.open_ftp(url)
399 else:
400 return self.open_local_file(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000401
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000402 def open_local_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000403 """Use local file."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000404 import mimetypes, mimetools, StringIO
405 mtype = mimetypes.guess_type(url)[0]
406 headers = mimetools.Message(StringIO.StringIO(
407 'Content-Type: %s\n' % (mtype or 'text/plain')))
408 host, file = splithost(url)
409 if not host:
Guido van Rossum336a2011999-06-24 15:27:36 +0000410 urlfile = file
411 if file[:1] == '/':
412 urlfile = 'file://' + file
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000413 return addinfourl(open(url2pathname(file), 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000414 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000415 host, port = splitport(host)
416 if not port \
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000417 and socket.gethostbyname(host) in (localhost(), thishost()):
Guido van Rossum336a2011999-06-24 15:27:36 +0000418 urlfile = file
419 if file[:1] == '/':
420 urlfile = 'file://' + file
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000421 return addinfourl(open(url2pathname(file), 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000422 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000423 raise IOError, ('local file error', 'not on local host')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000424
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000425 def open_ftp(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000426 """Use FTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000427 host, path = splithost(url)
428 if not host: raise IOError, ('ftp error', 'no host given')
429 host, port = splitport(host)
430 user, host = splituser(host)
431 if user: user, passwd = splitpasswd(user)
432 else: passwd = None
433 host = unquote(host)
434 user = unquote(user or '')
435 passwd = unquote(passwd or '')
436 host = socket.gethostbyname(host)
437 if not port:
438 import ftplib
439 port = ftplib.FTP_PORT
440 else:
441 port = int(port)
442 path, attrs = splitattr(path)
443 path = unquote(path)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000444 dirs = path.split('/')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000445 dirs, file = dirs[:-1], dirs[-1]
446 if dirs and not dirs[0]: dirs = dirs[1:]
Guido van Rossum5e006a31999-08-18 17:40:33 +0000447 if dirs and not dirs[0]: dirs[0] = '/'
Guido van Rossumb2493f82000-12-15 15:01:37 +0000448 key = user, host, port, '/'.join(dirs)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000449 # XXX thread unsafe!
450 if len(self.ftpcache) > MAXFTPCACHE:
451 # Prune the cache, rather arbitrarily
452 for k in self.ftpcache.keys():
453 if k != key:
454 v = self.ftpcache[k]
455 del self.ftpcache[k]
456 v.close()
457 try:
458 if not self.ftpcache.has_key(key):
459 self.ftpcache[key] = \
460 ftpwrapper(user, passwd, host, port, dirs)
461 if not file: type = 'D'
462 else: type = 'I'
463 for attr in attrs:
464 attr, value = splitvalue(attr)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000465 if attr.lower() == 'type' and \
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000466 value in ('a', 'A', 'i', 'I', 'd', 'D'):
Guido van Rossumb2493f82000-12-15 15:01:37 +0000467 type = value.upper()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000468 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
469 if retrlen is not None and retrlen >= 0:
470 import mimetools, StringIO
471 headers = mimetools.Message(StringIO.StringIO(
472 'Content-Length: %d\n' % retrlen))
473 else:
474 headers = noheaders()
475 return addinfourl(fp, headers, "ftp:" + url)
476 except ftperrors(), msg:
477 raise IOError, ('ftp error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000478
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000479 def open_data(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000480 """Use "data" URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000481 # ignore POSTed data
482 #
483 # syntax of data URLs:
484 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
485 # mediatype := [ type "/" subtype ] *( ";" parameter )
486 # data := *urlchar
487 # parameter := attribute "=" value
488 import StringIO, mimetools, time
489 try:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000490 [type, data] = url.split(',', 1)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000491 except ValueError:
492 raise IOError, ('data error', 'bad data URL')
493 if not type:
494 type = 'text/plain;charset=US-ASCII'
Guido van Rossumb2493f82000-12-15 15:01:37 +0000495 semi = type.rfind(';')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000496 if semi >= 0 and '=' not in type[semi:]:
497 encoding = type[semi+1:]
498 type = type[:semi]
499 else:
500 encoding = ''
501 msg = []
502 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
503 time.gmtime(time.time())))
504 msg.append('Content-type: %s' % type)
505 if encoding == 'base64':
506 import base64
507 data = base64.decodestring(data)
508 else:
509 data = unquote(data)
510 msg.append('Content-length: %d' % len(data))
511 msg.append('')
512 msg.append(data)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000513 msg = '\n'.join(msg)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000514 f = StringIO.StringIO(msg)
515 headers = mimetools.Message(f, 0)
516 f.fileno = None # needed for addinfourl
517 return addinfourl(f, headers, url)
Guido van Rossum6d4d1c21998-03-12 14:32:55 +0000518
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000519
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        """Accept the same positional and keyword arguments as
        URLopener.__init__ (including key_file and cert_file)."""
        URLopener.__init__(self, *args, **kwargs)
        # (user, passwd) tuples keyed by 'realm@host'.
        self.auth_cache = {}
        # Redirects followed so far, and the limit at which
        # http_error_302() gives up.
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        Once self.tries reaches self.maxtries the redirect is not
        followed; the reply is reported as error 500 through
        http_error_500() if that exists, else http_error_default().
        """
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Open the URL named by the 'location' (or 'uri') header.

        Returns None when neither header is present.
        """
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
        # The three URLopener.http_error_default() calls below raise
        # IOError, so each of them ends this method.
        if not headers.has_key('www-authenticate'):
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        # Dispatch to retry_http_basic_auth or retry_https_basic_auth.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Reopen url with a user name and password in the host part.

        Returns None when no user name or password is available.
        """
        host, selector = splithost(url)
        # i is nonzero exactly when the URL already carried 'user@';
        # it then makes get_user_passwd() discard the cached entry.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """HTTPS counterpart of retry_http_basic_auth(); calls
        open_https() directly."""
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = '//' + host + selector
        return self.open_https(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return (user, passwd) for realm at host.

        The cached answer is used unless clear_cache is true, in which
        case it is dropped and prompt_user_passwd() is asked again.
        """
        key = realm + '@' + host.lower()
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!

        Returns (None, None) when interrupted from the keyboard.
        """
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000638
639
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000640# Utility functions
641
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The address is looked up once with socket.gethostbyname() and then
    kept in the module-level _localhost variable.
    """
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000649
_thishost = None
def thishost():
    """Return the IP address of the current host.

    The host name reported by socket.gethostname() is resolved once and
    the result is kept in the module-level _thishost variable.
    """
    global _thishost
    if _thishost:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000657
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported only on first use; its all_errors value is kept
    in the module-level _ftperrors variable afterwards.
    """
    global _ftperrors
    if _ftperrors:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000666
_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object.

    The object is created on the first call and the same instance is
    returned afterwards.
    """
    global _noheaders
    # Compare against None explicitly: a Message supports len(), and an
    # empty one is therefore false, so a plain truth test would rebuild
    # the object on every call instead of reusing it.
    if _noheaders is None:
        import mimetools
        import StringIO
        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
        _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000677
678
679# Utility classes
680
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000681class ftpwrapper:
Guido van Rossume7b146f2000-02-04 15:28:42 +0000682 """Class used by open_ftp() for cache of open FTP connections."""
683
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000684 def __init__(self, user, passwd, host, port, dirs):
685 self.user = user
686 self.passwd = passwd
687 self.host = host
688 self.port = port
689 self.dirs = dirs
690 self.init()
Guido van Rossume7b146f2000-02-04 15:28:42 +0000691
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000692 def init(self):
693 import ftplib
694 self.busy = 0
695 self.ftp = ftplib.FTP()
696 self.ftp.connect(self.host, self.port)
697 self.ftp.login(self.user, self.passwd)
698 for dir in self.dirs:
699 self.ftp.cwd(dir)
Guido van Rossume7b146f2000-02-04 15:28:42 +0000700
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000701 def retrfile(self, file, type):
702 import ftplib
703 self.endtransfer()
704 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
705 else: cmd = 'TYPE ' + type; isdir = 0
706 try:
707 self.ftp.voidcmd(cmd)
708 except ftplib.all_errors:
709 self.init()
710 self.ftp.voidcmd(cmd)
711 conn = None
712 if file and not isdir:
713 # Use nlst to see if the file exists at all
714 try:
715 self.ftp.nlst(file)
716 except ftplib.error_perm, reason:
717 raise IOError, ('ftp error', reason), sys.exc_info()[2]
718 # Restore the transfer mode!
719 self.ftp.voidcmd(cmd)
720 # Try to retrieve as a file
721 try:
722 cmd = 'RETR ' + file
723 conn = self.ftp.ntransfercmd(cmd)
724 except ftplib.error_perm, reason:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000725 if str(reason)[:3] != '550':
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000726 raise IOError, ('ftp error', reason), sys.exc_info()[2]
727 if not conn:
728 # Set transfer mode to ASCII!
729 self.ftp.voidcmd('TYPE A')
730 # Try a directory listing
731 if file: cmd = 'LIST ' + file
732 else: cmd = 'LIST'
733 conn = self.ftp.ntransfercmd(cmd)
734 self.busy = 1
735 # Pass back both a suitably decorated object and a retrieval length
736 return (addclosehook(conn[0].makefile('rb'),
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000737 self.endtransfer), conn[1])
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000738 def endtransfer(self):
739 if not self.busy:
740 return
741 self.busy = 0
742 try:
743 self.ftp.voidresp()
744 except ftperrors():
745 pass
Guido van Rossume7b146f2000-02-04 15:28:42 +0000746
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000747 def close(self):
748 self.endtransfer()
749 try:
750 self.ftp.close()
751 except ftperrors():
752 pass
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000753
class addbase:
    """Base class for addinfo and addclosehook.

    Wraps a file-like object fp and exposes its read and readline
    methods directly, plus readlines and fileno when fp has them.
    """

    def __init__(self, fp):
        self.fp = fp
        self.read = fp.read
        self.readline = fp.readline
        # These two are optional on the wrapped object.
        for optional in ("readlines", "fileno"):
            if hasattr(fp, optional):
                setattr(self, optional, getattr(fp, optional))

    def __repr__(self):
        details = (self.__class__.__name__, repr(id(self)), repr(self.fp))
        return '<%s at %s whose fp = %s>' % details

    def close(self):
        """Drop the forwarded methods and close the wrapped file."""
        self.read = self.readline = self.readlines = self.fileno = None
        if self.fp:
            self.fp.close()
        self.fp = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000775
class addclosehook(addbase):
    """Class to add a close hook to an open file.

    closehook is called with hookargs after the wrapped file has been
    closed; it is called at most once because close() clears it.
    """

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        addbase.close(self)
        hook = self.closehook
        if hook:
            hook(*self.hookargs)
            self.closehook = None
            self.hookargs = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000790
class addinfo(addbase):
    """Class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        # headers is stored as given and handed back by info().
        self.headers = headers
        addbase.__init__(self, fp)

    def info(self):
        """Return the headers object passed to the constructor."""
        return self.headers
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000800
class addinfourl(addbase):
    """Class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        # Both values are stored as given and returned unchanged.
        self.url = url
        self.headers = headers
        addbase.__init__(self, fp)

    def info(self):
        """Return the headers object passed to the constructor."""
        return self.headers

    def geturl(self):
        """Return the URL passed to the constructor."""
        return self.url
Guido van Rossume6ad8911996-09-10 17:02:56 +0000814
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000815
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL.

    A url that already has a type (scheme) is returned unchanged.  A url
    that has a host part only inherits the type of base.  Otherwise both
    type and host come from base, and a relative path in url is resolved
    against the path of base (without its tag and query), including
    leading '../' components.
    """
    scheme, path = splittype(url)
    if scheme:
        # url is complete (it contains a type): nothing to inherit
        return url
    host, path = splithost(path)
    scheme, basepath = splittype(base)      # inherit type from base
    if host:
        if not scheme:
            # no type inherited, so url must have started with //
            return url
        # url contains a host, so only the type is inherited
        return scheme + '://' + host + path
    host, basepath = splithost(basepath)    # inherit host
    basepath = splittag(basepath)[0]        # remove extraneous cruft
    basepath = splitquery(basepath)[0]      # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query: attach it to all of basepath
            cut = len(basepath)
        else:
            # otherwise the last component of basepath gets replaced
            cut = basepath.rfind('/')
        if cut >= 0:
            # remove last file component
            basepath = basepath[:cut+1]
        elif host:
            # basepath not absolute but host present: make it absolute
            basepath = '/'
        else:
            # basepath not absolute, no host: keep it non-absolute
            basepath = ''
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            cut = basepath[:-1].rfind('/')
            if cut == 0:
                basepath = '/'
                break
            if cut > 0:
                basepath = basepath[:cut+1]
            else:
                basepath = ''

        path = basepath + path
    if scheme:
        if host:
            return scheme + '://' + host + path
        return scheme + ':' + path
    if host:
        return '//' + host + path   # don't know what this means
    return path
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000870
871
Guido van Rossum7c395db1994-07-04 22:14:49 +0000872# Utilities to parse URLs (most of these return None for missing parts):
Sjoerd Mullendere0371b81995-11-10 10:36:07 +0000873# unwrap('<URL:type://host/path>') --> 'type://host/path'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000874# splittype('type:opaquestring') --> 'type', 'opaquestring'
875# splithost('//host[:port]/path') --> 'host[:port]', '/path'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000876# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
877# splitpasswd('user:passwd') -> 'user', 'passwd'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000878# splitport('host:port') --> 'host', 'port'
879# splitquery('/path?query') --> '/path', 'query'
880# splittag('/path#tag') --> '/path', 'tag'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000881# splitattr('/path;attr1=value1;attr2=value2;...') ->
882# '/path', ['attr1=value1', 'attr2=value2', ...]
883# splitvalue('attr=value') --> 'attr', 'value'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000884# splitgophertype('/Xselector') --> 'X', 'selector'
885# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
887
def toBytes(url):
    """toBytes(u"URL") --> 'URL'.

    A Unicode url is encoded to ASCII; anything else is returned as is.
    Raises UnicodeError if the Unicode url has non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if type(url) is not types.UnicodeType:
        return url
    try:
        return url.encode("ASCII")
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
899
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Surrounding whitespace, one pair of enclosing angle brackets and a
    leading 'URL:' marker are removed, in that order.
    """
    url = url.strip()
    if url.startswith('<') and url.endswith('>'):
        url = url[1:-1].strip()
    if url.startswith('URL:'):
        url = url[4:].strip()
    return url
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000907
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  When url does not start with
    'type:' (one or more characters other than '/' and ':' followed by
    a colon), the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # compiled on first use only
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # The match covers the type plus the colon; the rest follows it.
    return found.group(1).lower(), url[found.end():]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000921
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    When url does not start with '//', the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        # compiled on first use only
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000933
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split is made at the first '@'; both parts are passed through
    unquote() and returned as a two-item list.  Without an '@' the
    result is the tuple (None, host).
    """
    global _userprog
    if _userprog is None:
        # compiled on first use only
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    found = _userprog.match(host)
    if found is None:
        return None, host
    return [unquote(part) for part in found.group(1, 2)]
Guido van Rossum7c395db1994-07-04 22:14:49 +0000945
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split is made at the first ':'.  Without a ':' the result is
    (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        # compiled on first use only
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000957
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string and must consist of one or more
    digits at the end of host; otherwise the result is (host, None).
    """
    global _portprog
    if _portprog is None:
        # compiled on first use only
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000970
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Return the given default port (-1 unless specified) if no ':' is
    found.  Return the port as a number if a valid number is found
    after the last ':'.  Return None as the port if there is a ':' but
    no valid number after it.
    """
    global _nportprog
    if _nportprog is None:
        # compiled on first use only
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port = found.group(1, 2)
    if not port:
        # a ':' with nothing after it
        return host, None
    try:
        return host, int(port)
    except ValueError:
        return host, None
Guido van Rossum53725a21996-06-13 19:12:35 +0000992
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split is made at the last '?'.  Without a '?' the result is
    (url, None).
    """
    global _queryprog
    if _queryprog is None:
        # compiled on first use only
        import re
        # Raw string, so the backslash reaches the regex engine as
        # written instead of relying on an unrecognised string escape.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001004
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split is made at the last '#'.  Without a '#' the result is
    (url, None).
    """
    global _tagprog
    if _tagprog is None:
        # compiled on first use only
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001016
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    pieces = url.split(';')
    path = pieces[0]
    del pieces[0]
    return path, pieces
Guido van Rossum7c395db1994-07-04 22:14:49 +00001022
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split is made at the first '='.  Without an '=' the result is
    (attr, None).
    """
    global _valueprog
    if _valueprog is None:
        # compiled on first use only
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)
Guido van Rossum7c395db1994-07-04 22:14:49 +00001034
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    The result is (None, selector) unless selector starts with '/' and
    has at least one more character.
    """
    if selector[:1] != '/' or not selector[1:2]:
        return None, selector
    return selector[1], selector[2:]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001040
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Each '%' followed by two hexadecimal digits is replaced by the
    character with that code.  A '%' that is not followed by a valid
    two-digit hexadecimal number is left in the result unchanged.
    """
    parts = s.split('%')
    # Everything before the first '%' is copied verbatim.
    res = [parts[0]]
    append = res.append
    for item in parts[1:]:
        if item[1:2]:
            try:
                append(chr(int(item[:2], 16)) + item[2:])
            except ValueError:
                # Not a usable hex escape: keep the text as it was.
                # Only ValueError is handled, so unrelated errors and
                # interrupts are no longer silently swallowed.
                append('%' + item)
        else:
            # Fewer than two characters follow the '%'.
            append('%' + item)
    return "".join(res)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001059
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'

    Like unquote(), but every '+' is turned into a space first.
    """
    if '+' in s:
        s = s.replace('+', ' ')
    return unquote(s)
Guido van Rossum0564e121996-12-13 14:47:36 +00001066
# Characters that quote() never escapes.
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')

# The safe set quote() ends up with when called with its default
# safe='/'; _fast_quote() below handles exactly that case.
_fast_safe_test = always_safe + '/'
_fast_safe = None

def _fast_quote(s):
    """Quote s, leaving only the characters of _fast_safe_test alone.

    Every other character becomes '%XX', with XX its code as two
    upper-case hexadecimal digits.  The lookup table is built on the
    first call and kept in the module-level _fast_safe variable.
    """
    global _fast_safe
    if _fast_safe is None:
        _fast_safe = {}
        for safe_char in _fast_safe_test:
            _fast_safe[safe_char] = safe_char
    pieces = []
    for ch in s:
        try:
            pieces.append(_fast_safe[ch])
        except KeyError:
            pieces.append('%%%02X' % ord(ch))
    return ''.join(pieces)
Jeremy Hylton7ae51bf2000-09-14 16:59:07 +00001086
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Letters, digits and the characters '_.-' are never quoted; neither
    are the characters given in safe.  Every other character is
    replaced by '%XX', where XX is its code in upper-case hexadecimal.

    Which characters need quoting depends on the part of the URL being
    built (path, query, ...).  RFC 2396 Uniform Resource Identifiers
    (URI): Generic Syntax lists these reserved characters:

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of them is reserved in some component of a URL, but not
    necessarily in all of them.

    By default this function is meant for quoting the path section of
    a URL, so '/' is not encoded: in typical usage the slashes already
    present in a path are being used as reserved characters.
    """
    allowed = always_safe + safe
    if allowed == _fast_safe_test:
        # the default safe set has a table-driven implementation
        return _fast_quote(s)
    pieces = []
    for ch in s:
        if ch in allowed:
            pieces.append(ch)
        else:
            pieces.append('%%%02X' % ord(ch))
    return ''.join(pieces)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001117
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'

    Everything between the spaces is quoted with quote(), using the
    given safe characters.
    """
    if ' ' not in s:
        return quote(s, safe)
    return '+'.join([quote(word, safe) for word in s.split(' ')])
Guido van Rossum0564e121996-12-13 14:47:36 +00001127
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001128def urlencode(query,doseq=0):
1129 """Encode a sequence of two-element tuples or dictionary into a URL query string.
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001130
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001131 If any values in the query arg are sequences and doseq is true, each
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001132 sequence element is converted to a separate parameter.
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001133
1134 If the query arg is a sequence of two-element tuples, the order of the
1135 parameters in the output will match the order of parameters in the
1136 input.
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001137 """
Tim Peters658cba62001-02-09 20:06:00 +00001138
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001139 if hasattr(query,"items"):
1140 # mapping objects
1141 query = query.items()
1142 else:
1143 # it's a bother at times that strings and string-like objects are
1144 # sequences...
1145 try:
1146 # non-sequence items should not work with len()
1147 x = len(query)
1148 # non-empty strings will fail this
1149 if len(query) and type(query[0]) != types.TupleType:
1150 raise TypeError
1151 # zero-length sequences of all types will get here and succeed,
1152 # but that's a minor nit - since the original implementation
1153 # allowed empty dicts that type of behavior probably should be
1154 # preserved for consistency
1155 except TypeError:
1156 ty,va,tb = sys.exc_info()
1157 raise TypeError, "not a valid non-string sequence or mapping object", tb
1158
Guido van Rossume7b146f2000-02-04 15:28:42 +00001159 l = []
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001160 if not doseq:
1161 # preserve old behavior
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001162 for k, v in query:
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001163 k = quote_plus(str(k))
1164 v = quote_plus(str(v))
1165 l.append(k + '=' + v)
1166 else:
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001167 for k, v in query:
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001168 k = quote_plus(str(k))
1169 if type(v) == types.StringType:
1170 v = quote_plus(v)
1171 l.append(k + '=' + v)
1172 elif type(v) == types.UnicodeType:
1173 # is there a reasonable way to convert to ASCII?
1174 # encode generates a string, but "replace" or "ignore"
1175 # lose information and "strict" can raise UnicodeError
1176 v = quote_plus(v.encode("ASCII","replace"))
1177 l.append(k + '=' + v)
1178 else:
1179 try:
1180 # is this a sufficient test for sequence-ness?
1181 x = len(v)
1182 except TypeError:
1183 # not a sequence
1184 v = quote_plus(str(v))
1185 l.append(k + '=' + v)
1186 else:
1187 # loop over the sequence
1188 for elt in v:
1189 l.append(k + '=' + quote_plus(str(elt)))
Guido van Rossumb2493f82000-12-15 15:01:37 +00001190 return '&'.join(l)
Guido van Rossum810a3391998-07-22 21:33:23 +00001191
Guido van Rossum442e7201996-03-20 15:33:11 +00001192# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy (in any
    letter case) with a non-empty value; this seems to be the standard
    convention.  If you need a different way, you can pass a proxies
    dictionary to the [Fancy]URLopener constructor.

    """
    suffix = '_proxy'
    proxies = {}
    for name, value in os.environ.items():
        if not value:
            continue
        name = name.lower()
        if name.endswith(suffix):
            proxies[name[:-len(suffix)]] = value
    return proxies
1208
if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        An empty dictionary is returned when the ic module cannot be
        imported or Internet Config cannot be opened.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        The ProxyEnable and ProxyServer values of the current user's
        Internet Settings key are read.  ProxyServer is either a list of
        'protocol=address' entries separated by ';' or a single server
        that is then used for both http and ftp.  Registry errors and
        values in an unexpected format end the scan; whatever was
        collected up to that point is returned.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            # Close the key on every path out of here, including when a
            # query or the parsing below raises.
            try:
                proxyEnable = _winreg.QueryValueEx(internetSettings,
                                                   'ProxyEnable')[0]
                if proxyEnable:
                    # Returned as Unicode but problems if not converted to ASCII
                    proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                           'ProxyServer')[0])
                    if '=' in proxyServer:
                        # Per-protocol settings
                        for p in proxyServer.split(';'):
                            protocol, address = p.split('=', 1)
                            proxies[protocol] = '%s://%s' % (protocol, address)
                    else:
                        # Use one setting for all protocols
                        if proxyServer[:5] == 'http:':
                            proxies['http'] = proxyServer
                        else:
                            proxies['http'] = 'http://%s' % proxyServer
                            proxies['ftp'] = 'ftp://%s' % proxyServer
            finally:
                internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # Nothing more to add to proxies in that case.
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()
else:
    # By default use environment variables
    getproxies = getproxies_environment
1293
Guido van Rossum442e7201996-03-20 15:33:11 +00001294
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001295# Test and time quote() and unquote()
def test1():
    """Round-trip all 256 character codes through quote() and unquote().

    The test string holds every code four times.  'Wrong!' is printed
    if the round trip does not give the string back; the string, its
    quoted form, the unquoted result and the elapsed time (rounded to
    three decimals) are always printed.
    """
    import time
    s = ''.join([chr(code) for code in range(256)]) * 4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print('Wrong!')
    print(repr(s))
    print(repr(qs))
    print(repr(uqs))
    print('%s sec' % (round(t1 - t0, 3),))
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001311
1312
def reporthook(blocknum, blocksize, totalsize):
    """Report during remote transfers.

    Prints one line with the block number, block size and total size;
    test() passes this function to urlretrieve().
    """
    progress = "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)
    print(progress)
Guido van Rossum9ab96d41998-09-28 14:07:00 +00001317
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001318# Test program
Guido van Rossum23490151998-06-25 02:39:00 +00001319def test(args=[]):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001320 if not args:
1321 args = [
1322 '/etc/passwd',
1323 'file:/etc/passwd',
1324 'file://localhost/etc/passwd',
1325 'ftp://ftp.python.org/etc/passwd',
1326## 'gopher://gopher.micro.umn.edu/1/',
1327 'http://www.python.org/index.html',
1328 ]
Guido van Rossum09c8b6c1999-12-07 21:37:17 +00001329 if hasattr(URLopener, "open_https"):
1330 args.append('https://synergy.as.cmu.edu/~geek/')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001331 try:
1332 for url in args:
1333 print '-'*10, url, '-'*10
1334 fn, h = urlretrieve(url, None, reporthook)
Guido van Rossumb2493f82000-12-15 15:01:37 +00001335 print fn
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001336 if h:
1337 print '======'
1338 for k in h.keys(): print k + ':', h[k]
1339 print '======'
1340 fp = open(fn, 'rb')
1341 data = fp.read()
1342 del fp
1343 if '\r' in data:
1344 table = string.maketrans("", "")
Guido van Rossumb2493f82000-12-15 15:01:37 +00001345 data = data.translate(table, "\r")
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001346 print data
1347 fn, h = None, None
1348 print '-'*40
1349 finally:
1350 urlcleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001351
Guido van Rossum23490151998-06-25 02:39:00 +00001352def main():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001353 import getopt, sys
1354 try:
1355 opts, args = getopt.getopt(sys.argv[1:], "th")
1356 except getopt.error, msg:
1357 print msg
1358 print "Use -h for help"
1359 return
1360 t = 0
1361 for o, a in opts:
1362 if o == '-t':
1363 t = t + 1
1364 if o == '-h':
1365 print "Usage: python urllib.py [-t] [url ...]"
1366 print "-t runs self-test;",
1367 print "otherwise, contents of urls are printed"
1368 return
1369 if t:
1370 if t > 1:
1371 test1()
1372 test(args)
1373 else:
1374 if not args:
1375 print "Use -h for help"
1376 for url in args:
1377 print urlopen(url).read(),
Guido van Rossum23490151998-06-25 02:39:00 +00001378
# When executed as a script (rather than imported), hand control to main().
if __name__ == '__main__':
    main()