blob: 7f324286f9c147dc757d3327c5e451932a796e50 [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""Open an arbitrary URL.
2
3See the following document for more info on URLs:
4"Names and Addresses, URIs, URLs, URNs, URCs", at
5http://www.w3.org/pub/WWW/Addressing/Overview.html
6
7See also the HTTP spec (from which the error codes are derived):
8"HTTP - Hypertext Transfer Protocol", at
9http://www.w3.org/pub/WWW/Protocols/
10
11Related standards and specs:
12- RFC1808: the "relative URL" spec. (authoritative status)
13- RFC1738 - the "URL standard". (authoritative status)
14- RFC1630 - the "URI spec". (informational status)
15
16The object returned by URLopener().open(file) will differ per
17protocol. All you know is that it has methods read(), readline(),
18readlines(), fileno(), close() and info(). The read*(), fileno()
Fredrik Lundhb49f88b2000-09-24 18:51:25 +000019and close() methods work like those of open files.
Guido van Rossume7b146f2000-02-04 15:28:42 +000020The info() method returns a mimetools.Message object which can be
21used to query various info about the object, if available.
22(mimetools.Message objects are queried with the getheader() method.)
23"""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000024
Guido van Rossum7c395db1994-07-04 22:14:49 +000025import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000026import socket
Jack Jansendc3e3f61995-12-15 13:22:13 +000027import os
Guido van Rossum3c8484e1996-11-20 22:02:24 +000028import sys
Martin v. Löwis1d994332000-12-03 18:30:10 +000029import types
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000030
Skip Montanaro40fc1602001-03-01 04:27:19 +000031__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
32 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
Jack Jansen282fed12001-03-05 13:45:38 +000033 "urlencode", "url2pathname", "pathname2url", "splittag"]
Skip Montanaro40fc1602001-03-01 04:27:19 +000034
Guido van Rossumb2493f82000-12-15 15:01:37 +000035__version__ = '1.15' # XXX This version is not always updated :-(
Guido van Rossumf668d171997-06-06 21:11:11 +000036
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000037MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
Guido van Rossum6cb15a01995-06-22 19:00:13 +000038
# Select the URL <-> local-path converters for the running platform.
# 'mac', 'nt' and 'riscos' import them from a dedicated helper module;
# on every other platform they are plain percent-decoding / -encoding.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """Return the local path for a URL path (percent-decoded)."""
        return unquote(pathname)

    def pathname2url(pathname):
        """Return the URL path for a local path (percent-encoded)."""
        return quote(pathname)
Guido van Rossum33add0a1998-12-18 15:25:22 +000051
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000052# This really consists of two pieces:
53# (1) a class which handles opening of all sorts of URLs
54# (plus assorted utilities etc.)
55# (2) a set of functions for parsing URLs
56# XXX Should these be separated out into different modules?
57
58
59# Shortcut for basic usage
# Module-wide opener shared by urlopen(), urlretrieve() and urlcleanup();
# created lazily on first use.
_urlopener = None
def urlopen(url, data=None):
    """urlopen(url [, data]) -> open file-like object

    Opens *url* through the shared FancyURLopener, creating it on the
    first call.  *data* is only passed on to the opener when it is not
    None.
    """
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    opener = _urlopener
    if data is not None:
        return opener.open(url, data)
    return opener.open(url)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve *url* through the shared opener's retrieve() method.

    All four arguments are forwarded unchanged and the opener's result is
    returned.  The shared FancyURLopener is created on first use.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    """Call cleanup() on the shared opener, if one has been created."""
    opener = _urlopener
    if opener:
        opener.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000078
79
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000080ftpcache = {}
81class URLopener:
Guido van Rossume7b146f2000-02-04 15:28:42 +000082 """Class to open URLs.
83 This is a class rather than just a subroutine because we may need
84 more than one set of global protocol-specific options.
85 Note -- this is a base class for those who don't want the
86 automatic handling of errors type 302 (relocated) and 401
87 (authorization needed)."""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000088
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000089 __tempfiles = None
Guido van Rossum29e77811996-11-27 19:39:58 +000090
Guido van Rossumba311382000-08-24 16:18:04 +000091 version = "Python-urllib/%s" % __version__
92
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000093 # Constructor
def __init__(self, proxies=None, **x509):
    """Initialise proxy table, SSL key/cert file names and default headers.

    proxies -- mapping from URL scheme to proxy URL; when None, the
               result of getproxies() is used.
    x509    -- optional 'key_file' and 'cert_file' keyword arguments;
               missing entries are stored as None.
    """
    if proxies is None:
        proxies = getproxies()
    assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
    self.proxies = proxies
    self.cert_file = x509.get('cert_file')
    self.key_file = x509.get('key_file')
    self.addheaders = [('User-agent', self.version)]
    # Names of temporary files created by retrieve(); cleanup() removes them.
    self.__tempfiles = []
    # Private reference to os.unlink -- see cleanup(), which must not
    # rely on module globals.
    self.__unlink = os.unlink
    # Undocumented feature: if you assign {} to tempcache, it is used
    # to cache files retrieved with self.retrieve().  This is not
    # enabled by default since it does not work for changing documents
    # (and there is no logic to check expiration headers yet).
    self.tempcache = None
    # Undocumented feature: you can use a different ftp cache by
    # assigning to the .ftpcache member, in case you want logically
    # independent URL openers.
    # XXX This is not threadsafe.
    self.ftpcache = ftpcache
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000116
def __del__(self):
    """Finalizer: delegate to close() so temporary files get removed."""
    self.close()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000119
def close(self):
    """Release the opener's resources; currently just calls cleanup()."""
    self.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000122
def cleanup(self):
    """Delete the temporary files made by retrieve() and empty tempcache.

    Failures while unlinking are ignored on purpose: this is best-effort
    housekeeping.
    """
    # This code sometimes runs when the rest of this module has already
    # been deleted, so it can't use any globals or import anything.
    pending = self.__tempfiles
    if pending:
        unlink = self.__unlink
        for path in pending:
            try:
                unlink(path)
            except:
                pass
        del pending[:]
    cache = self.tempcache
    if cache:
        cache.clear()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000136
def addheader(self, *args):
    """Add a header to be used by the HTTP interface only
    e.g. u.addheader('Accept', 'sound/basic')

    The positional arguments are stored as one tuple in self.addheaders.
    """
    header = args
    self.addheaders.append(header)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000141
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000142 # External interface
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000143 def open(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000144 """Use URLopener().open(file) instead of open(file, 'r')."""
Martin v. Löwis1d994332000-12-03 18:30:10 +0000145 fullurl = unwrap(toBytes(fullurl))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000146 if self.tempcache and self.tempcache.has_key(fullurl):
147 filename, headers = self.tempcache[fullurl]
148 fp = open(filename, 'rb')
149 return addinfourl(fp, headers, fullurl)
Martin v. Löwis1d994332000-12-03 18:30:10 +0000150 urltype, url = splittype(fullurl)
151 if not urltype:
152 urltype = 'file'
153 if self.proxies.has_key(urltype):
154 proxy = self.proxies[urltype]
155 urltype, proxyhost = splittype(proxy)
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000156 host, selector = splithost(proxyhost)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000157 url = (host, fullurl) # Signal special case to open_*()
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000158 else:
159 proxy = None
Martin v. Löwis1d994332000-12-03 18:30:10 +0000160 name = 'open_' + urltype
161 self.type = urltype
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000162 if '-' in name:
163 # replace - with _
Guido van Rossumb2493f82000-12-15 15:01:37 +0000164 name = '_'.join(name.split('-'))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000165 if not hasattr(self, name):
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000166 if proxy:
167 return self.open_unknown_proxy(proxy, fullurl, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000168 else:
169 return self.open_unknown(fullurl, data)
170 try:
171 if data is None:
172 return getattr(self, name)(url)
173 else:
174 return getattr(self, name)(url, data)
175 except socket.error, msg:
176 raise IOError, ('socket error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000177
def open_unknown(self, fullurl, data=None):
    """Overridable interface to open unknown URL type.

    This default implementation always raises IOError naming the scheme
    of *fullurl*; *data* is ignored.
    """
    scheme, rest = splittype(fullurl)
    raise IOError('url error', 'unknown url type', scheme)
Guido van Rossumca445401995-08-29 19:19:12 +0000182
def open_unknown_proxy(self, proxy, fullurl, data=None):
    """Overridable interface for a proxy whose scheme has no handler.

    This default implementation always raises IOError mentioning the
    scheme of *fullurl* and the offending *proxy*; *data* is ignored.
    """
    scheme, rest = splittype(fullurl)
    raise IOError('url error', 'invalid proxy for %s' % scheme, proxy)
187
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000188 # External interface
def retrieve(self, url, filename=None, reporthook=None, data=None):
    """Copy the object at *url* to a local file.

    Returns (filename, headers).  For a local 'file' URL with no explicit
    *filename* the file is not copied: its own path and headers are
    returned.  Otherwise the data is written to *filename*, or to a
    temporary file that cleanup() removes later.

    reporthook -- optional callable(blocknum, blocksize, totalsize);
                  totalsize is -1 when no Content-Length header is present.
    data       -- passed on to open().

    Both the source object and the destination file are closed even when
    the copy fails, and the result is only recorded in tempcache after a
    complete copy.
    """
    url = unwrap(toBytes(url))
    if self.tempcache and self.tempcache.has_key(url):
        return self.tempcache[url]
    scheme, url1 = splittype(url)
    if not filename and (not scheme or scheme == 'file'):
        # Local file and no target requested: no copy needed.
        try:
            fp = self.open_local_file(url1)
            try:
                hdrs = fp.info()
            finally:
                fp.close()
            return url2pathname(splithost(url1)[1]), hdrs
        except IOError:
            # Not actually local; fall through to the generic path.
            pass
    fp = self.open(url, data)
    try:
        headers = fp.info()
        if not filename:
            import tempfile
            # Derive a suffix for the temporary file from the URL path.
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            # NOTE(review): mktemp() only chooses a name, so another
            # process could create the file first -- confirm acceptable.
            filename = tempfile.mktemp(suffix)
            self.__tempfiles.append(filename)
        result = filename, headers
        tfp = open(filename, 'wb')
        try:
            bs = 1024*8
            size = -1
            blocknum = 1
            if reporthook:
                if headers.has_key("content-length"):
                    size = int(headers["Content-Length"])
                reporthook(0, bs, size)
            block = fp.read(bs)
            if reporthook:
                reporthook(1, bs, size)
            while block:
                tfp.write(block)
                block = fp.read(bs)
                blocknum = blocknum + 1
                if reporthook:
                    reporthook(blocknum, bs, size)
        finally:
            tfp.close()
    finally:
        fp.close()
    # Cache only after the whole object was copied successfully.
    if self.tempcache is not None:
        self.tempcache[url] = result
    return result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000240
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000241 # Each method named open_<type> knows how to open that type of URL
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000242
def open_http(self, url, data=None):
    """Use HTTP protocol.

    *url* is either the string left after the scheme ("//host/selector")
    or, when open() routes through a proxy, a (proxyhost, fullurl) tuple.
    With *data* a POST is sent, otherwise a GET.  A 200 reply is returned
    as an addinfourl object; any other status goes to http_error().
    """
    import httplib
    user_passwd = None
    if type(url) is types.StringType:
        # Direct connection.
        host, selector = splithost(url)
        if host:
            user_passwd, host = splituser(host)
            host = unquote(host)
        realhost = host
    else:
        # Proxy connection: the full URL becomes the request selector.
        host, selector = url
        urltype, rest = splittype(selector)
        url = rest
        user_passwd = None
        if urltype.lower() == 'http':
            realhost, rest = splithost(rest)
            if realhost:
                user_passwd, realhost = splituser(realhost)
            if user_passwd:
                # Strip the credentials from the selector sent to the proxy.
                selector = "%s://%s%s" % (urltype, realhost, rest)
        else:
            realhost = None
    if not host:
        raise IOError('http error', 'no host given')
    auth = None
    if user_passwd:
        import base64
        auth = base64.encodestring(user_passwd).strip()
    h = httplib.HTTP(host)
    if data is None:
        h.putrequest('GET', selector)
    else:
        h.putrequest('POST', selector)
        h.putheader('Content-type', 'application/x-www-form-urlencoded')
        h.putheader('Content-length', '%d' % len(data))
    if auth:
        h.putheader('Authorization', 'Basic %s' % auth)
    if realhost:
        h.putheader('Host', realhost)
    for args in self.addheaders:
        h.putheader(*args)
    h.endheaders()
    if data is not None:
        # NOTE(review): a CRLF is appended to the body but is not counted
        # in Content-length -- confirm this is still wanted.
        h.send(data + '\r\n')
    errcode, errmsg, headers = h.getreply()
    fp = h.getfile()
    if errcode == 200:
        return addinfourl(fp, headers, "http:" + url)
    if data is None:
        return self.http_error(url, fp, errcode, errmsg, headers)
    return self.http_error(url, fp, errcode, errmsg, headers, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000295
def http_error(self, url, fp, errcode, errmsg, headers, data=None):
    """Handle http errors.

    Derived class can override this, or provide specific handlers
    named http_error_DDD where DDD is the 3-digit error code.
    A specific handler is called with *data* only when data is not None;
    if it is missing or returns a false value, http_error_default() is
    called (always without data).
    """
    handler_name = 'http_error_%d' % errcode
    if hasattr(self, handler_name):
        handler = getattr(self, handler_name)
        call_args = (url, fp, errcode, errmsg, headers)
        if data is not None:
            call_args = call_args + (data,)
        outcome = handler(*call_args)
        if outcome:
            return outcome
    return self.http_error_default(url, fp, errcode, errmsg, headers)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000310
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Default error handler: close the connection and raise IOError.

    The remaining response body is read and discarded before closing.
    """
    fp.read()
    fp.close()
    raise IOError('http error', errcode, errmsg, headers)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000316
if hasattr(socket, "ssl"):
    def open_https(self, url, data=None):
        """Use HTTPS protocol.

        Only defined when the socket module has SSL support.  *url* is
        either the string left after the scheme ("//host/selector") or,
        when open() routes through a proxy, a (proxyhost, fullurl) tuple.
        With *data* a POST is sent, otherwise a GET.  A 200 reply is
        returned as an addinfourl object whose URL carries the "https:"
        scheme; any other status goes to http_error().
        """
        import httplib
        user_passwd = None
        if type(url) is types.StringType:
            # Direct connection.
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            # Proxy connection: the full URL becomes the request selector.
            host, selector = url
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'https':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    # Strip the credentials from the selector.
                    selector = "%s://%s%s" % (urltype, realhost, rest)
        if not host:
            raise IOError('https error', 'no host given')
        if user_passwd:
            import base64
            auth = base64.encodestring(user_passwd).strip()
        else:
            auth = None
        h = httplib.HTTPS(host, 0,
                          key_file=self.key_file,
                          cert_file=self.cert_file)
        if data is not None:
            h.putrequest('POST', selector)
            h.putheader('Content-type',
                        'application/x-www-form-urlencoded')
            h.putheader('Content-length', '%d' % len(data))
        else:
            h.putrequest('GET', selector)
        # putheader() takes the header name and its value separately,
        # exactly as open_http() passes them.
        if auth:
            h.putheader('Authorization', 'Basic %s' % auth)
        if realhost:
            h.putheader('Host', realhost)
        for args in self.addheaders:
            h.putheader(*args)
        h.endheaders()
        if data is not None:
            h.send(data + '\r\n')
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()
        if errcode == 200:
            # Report a complete URL, mirroring open_http()'s "http:" + url.
            return addinfourl(fp, headers, "https:" + url)
        if data is None:
            return self.http_error(url, fp, errcode, errmsg, headers)
        return self.http_error(url, fp, errcode, errmsg, headers, data)
Fred Drake567ca8e2000-08-21 21:42:42 +0000374
def open_gopher(self, url):
    """Use Gopher protocol.

    Raises IOError when the URL has no host.  A selector with a query
    part is sent with gopherlib.send_query(), otherwise with
    gopherlib.send_selector().
    """
    import gopherlib
    host, selector = splithost(url)
    if not host:
        raise IOError('gopher error', 'no host given')
    host = unquote(host)
    gtype, selector = splitgophertype(selector)
    selector, query = splitquery(selector)
    selector = unquote(selector)
    if not query:
        fp = gopherlib.send_selector(selector, host)
    else:
        fp = gopherlib.send_query(selector, unquote(query), host)
    return addinfourl(fp, noheaders(), "gopher:" + url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000390
def open_file(self, url):
    """Use local file or FTP depending on form of URL.

    A URL starting with exactly two slashes (a host part follows) is
    handed to open_ftp(); everything else goes to open_local_file().
    """
    has_host_part = url[:2] == '//' and url[2:3] != '/'
    if not has_host_part:
        return self.open_local_file(url)
    return self.open_ftp(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000397
def open_local_file(self, url):
    """Use local file.

    The Content-Type header is guessed from the URL with mimetypes
    (default 'text/plain').  A URL naming a host is accepted only when it
    has no port and the host resolves to the address of localhost() or
    thishost(); otherwise IOError is raised.
    """
    import mimetypes
    import mimetools
    import StringIO
    mtype = mimetypes.guess_type(url)[0]
    headers = mimetools.Message(StringIO.StringIO(
        'Content-Type: %s\n' % (mtype or 'text/plain')))
    host, path = splithost(url)
    if host:
        host, port = splitport(host)
        if port or \
           socket.gethostbyname(host) not in (localhost(), thishost()):
            raise IOError('local file error', 'not on local host')
    # Absolute paths are reported back as file:// URLs.
    urlfile = path
    if path[:1] == '/':
        urlfile = 'file://' + path
    return addinfourl(open(url2pathname(path), 'rb'), headers, urlfile)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000420
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000421 def open_ftp(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000422 """Use FTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000423 host, path = splithost(url)
424 if not host: raise IOError, ('ftp error', 'no host given')
425 host, port = splitport(host)
426 user, host = splituser(host)
427 if user: user, passwd = splitpasswd(user)
428 else: passwd = None
429 host = unquote(host)
430 user = unquote(user or '')
431 passwd = unquote(passwd or '')
432 host = socket.gethostbyname(host)
433 if not port:
434 import ftplib
435 port = ftplib.FTP_PORT
436 else:
437 port = int(port)
438 path, attrs = splitattr(path)
439 path = unquote(path)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000440 dirs = path.split('/')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000441 dirs, file = dirs[:-1], dirs[-1]
442 if dirs and not dirs[0]: dirs = dirs[1:]
Guido van Rossum5e006a31999-08-18 17:40:33 +0000443 if dirs and not dirs[0]: dirs[0] = '/'
Guido van Rossumb2493f82000-12-15 15:01:37 +0000444 key = user, host, port, '/'.join(dirs)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000445 # XXX thread unsafe!
446 if len(self.ftpcache) > MAXFTPCACHE:
447 # Prune the cache, rather arbitrarily
448 for k in self.ftpcache.keys():
449 if k != key:
450 v = self.ftpcache[k]
451 del self.ftpcache[k]
452 v.close()
453 try:
454 if not self.ftpcache.has_key(key):
455 self.ftpcache[key] = \
456 ftpwrapper(user, passwd, host, port, dirs)
457 if not file: type = 'D'
458 else: type = 'I'
459 for attr in attrs:
460 attr, value = splitvalue(attr)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000461 if attr.lower() == 'type' and \
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000462 value in ('a', 'A', 'i', 'I', 'd', 'D'):
Guido van Rossumb2493f82000-12-15 15:01:37 +0000463 type = value.upper()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000464 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
465 if retrlen is not None and retrlen >= 0:
466 import mimetools, StringIO
467 headers = mimetools.Message(StringIO.StringIO(
468 'Content-Length: %d\n' % retrlen))
469 else:
470 headers = noheaders()
471 return addinfourl(fp, headers, "ftp:" + url)
472 except ftperrors(), msg:
473 raise IOError, ('ftp error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000474
def open_data(self, url, data=None):
    """Use "data" URL.

    The *data* argument (POSTed data) is ignored.  Raises IOError when
    the URL contains no comma.  Returns an addinfourl object whose
    headers carry Date, Content-type and Content-length.
    """
    # syntax of data URLs:
    # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    # mediatype := [ type "/" subtype ] *( ";" parameter )
    # data      := *urlchar
    # parameter := attribute "=" value
    import StringIO, mimetools, time
    try:
        [mediatype, data] = url.split(',', 1)
    except ValueError:
        raise IOError('data error', 'bad data URL')
    if not mediatype:
        mediatype = 'text/plain;charset=US-ASCII'
    # A trailing ";xxx" without '=' is the encoding, not a parameter.
    semi = mediatype.rfind(';')
    if semi >= 0 and '=' not in mediatype[semi:]:
        encoding = mediatype[semi+1:]
        mediatype = mediatype[:semi]
    else:
        encoding = ''
    msg = []
    # Spell the time out as %H:%M:%S; %T is not a portable directive.
    msg.append('Date: %s' % time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                          time.gmtime(time.time())))
    msg.append('Content-type: %s' % mediatype)
    if encoding == 'base64':
        import base64
        data = base64.decodestring(data)
    else:
        data = unquote(data)
    msg.append('Content-length: %d' % len(data))
    msg.append('')
    msg.append(data)
    msg = '\n'.join(msg)
    f = StringIO.StringIO(msg)
    headers = mimetools.Message(f, 0)
    f.fileno = None     # needed for addinfourl
    return addinfourl(f, headers, url)
Guido van Rossum6d4d1c21998-03-12 14:32:55 +0000514
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000515
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000516class FancyURLopener(URLopener):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000517 """Derived class with handlers for errors we can handle (perhaps)."""
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000518
def __init__(self, *args, **kwargs):
    """Initialise the base opener plus authentication/redirect state.

    Positional and keyword arguments are both forwarded to
    URLopener.__init__, so key_file= / cert_file= can be given here too.
    """
    URLopener.__init__(self, *args, **kwargs)
    # Credentials obtained so far, keyed by "realm@host".
    self.auth_cache = {}
    # Redirect counter and its limit, used by http_error_302().
    self.tries = 0
    self.maxtries = 10
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000524
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Default error handling -- don't raise an exception.

    The error response is returned as an ordinary addinfourl object.
    """
    response_url = "http:" + url
    return addinfourl(fp, headers, response_url)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000528
def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 302 -- relocated (temporarily).

    Follows the redirect through redirect_internal().  Once self.tries
    reaches self.maxtries (when maxtries is non-zero) the chain is
    reported as a 500 error via http_error_500 if present, otherwise via
    http_error_default.  The counter is reset before returning.
    """
    self.tries += 1
    limit_reached = self.maxtries and self.tries >= self.maxtries
    if not limit_reached:
        outcome = self.redirect_internal(url, fp, errcode, errmsg,
                                         headers, data)
        self.tries = 0
        return outcome
    # Too many redirects in a row: give up.
    if hasattr(self, "http_error_500"):
        handler = self.http_error_500
    else:
        handler = self.http_error_default
    self.tries = 0
    return handler(url, fp, 500,
                   "Internal Server Error: Redirect Recursion", headers)
544
def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
    """Open the redirect target named by the Location (or URI) header.

    Returns None when neither header is present.  Otherwise the current
    response is drained and closed, and the target -- joined with the
    original URL in case it is relative -- is opened, with *data* only
    when data is not None.
    """
    for field in ('location', 'uri'):
        if headers.has_key(field):
            newurl = headers[field]
            break
    else:
        return
    fp.read()
    fp.close()
    # In case the server sent a relative URL, join with original:
    newurl = basejoin("http:" + url, newurl)
    if data is not None:
        return self.open(newurl, data)
    return self.open(newurl)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000560
def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 301 -- also relocated (permanently).

    Handled exactly like a 302 by delegating to http_error_302().
    """
    handler = self.http_error_302
    return handler(url, fp, errcode, errmsg, headers, data)
Guido van Rossume6ad8911996-09-10 17:02:56 +0000564
def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 401 -- authentication required.

    See this URL for a description of the basic authentication scheme:
    http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt

    Only the 'basic' scheme is handled: the request is retried through
    retry_<type>_basic_auth (with *data* only when data is not None).
    A missing or unparsable WWW-Authenticate header, or any other scheme,
    is passed to URLopener.http_error_default, which raises IOError.
    """
    if not headers.has_key('www-authenticate'):
        # errcode must be passed here as well; without it the call fails
        # with a TypeError instead of raising the intended IOError.
        URLopener.http_error_default(self, url, fp,
                                     errcode, errmsg, headers)
    stuff = headers['www-authenticate']
    import re
    match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
    if not match:
        URLopener.http_error_default(self, url, fp,
                                     errcode, errmsg, headers)
    scheme, realm = match.groups()
    if scheme.lower() != 'basic':
        URLopener.http_error_default(self, url, fp,
                                     errcode, errmsg, headers)
    # self.type is the URL scheme recorded by open().
    name = 'retry_' + self.type + '_basic_auth'
    if data is None:
        return getattr(self, name)(url, realm)
    else:
        return getattr(self, name)(url, realm, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000587
def retry_http_basic_auth(self, url, realm, data=None):
    """Retry an http request with user:password embedded in the URL.

    Returns None when get_user_passwd() yields neither a user nor a
    password; otherwise the result of open() on the rewritten URL.
    """
    host, selector = splithost(url)
    # i is non-zero when the URL already carried "user:pw@"; passing it as
    # clear_cache makes get_user_passwd() drop the cached entry.
    i = host.find('@') + 1
    host = host[i:]
    user, passwd = self.get_user_passwd(host, realm, i)
    if not user and not passwd:
        return None
    credentials = quote(user, safe='') + ':' + quote(passwd, safe='')
    newurl = 'http://' + credentials + '@' + host + selector
    if data is not None:
        return self.open(newurl, data)
    return self.open(newurl)
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000600
def retry_https_basic_auth(self, url, realm, data=None):
    """Retry an https request with user:password embedded in the URL.

    Returns None when get_user_passwd() yields neither a user nor a
    password; otherwise the result of open_https() on the rewritten URL.
    """
    host, selector = splithost(url)
    # i is non-zero when the URL already carried "user:pw@"; passing it as
    # clear_cache makes get_user_passwd() drop the cached entry.
    i = host.find('@') + 1
    host = host[i:]
    user, passwd = self.get_user_passwd(host, realm, i)
    if not user and not passwd:
        return None
    credentials = quote(user, safe='') + ':' + quote(passwd, safe='')
    newurl = '//' + credentials + '@' + host + selector
    return self.open_https(newurl, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000610
def get_user_passwd(self, host, realm, clear_cache = 0):
    """Return (user, passwd) for *realm* at *host*.

    Cached credentials (keyed by realm + '@' + lower-cased host) are
    returned unless *clear_cache* is true, in which case the entry is
    dropped.  Without a usable entry prompt_user_passwd() is called and
    its answer cached when it contains a user or a password.
    """
    key = realm + '@' + host.lower()
    try:
        cached = self.auth_cache[key]
    except KeyError:
        pass
    else:
        if not clear_cache:
            return cached
        del self.auth_cache[key]
    user, passwd = self.prompt_user_passwd(host, realm)
    if user or passwd:
        self.auth_cache[key] = (user, passwd)
    return user, passwd
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000621
def prompt_user_passwd(self, host, realm):
    """Override this in a GUI environment!

    Asks for a user name and a password on the terminal and returns
    them as (user, passwd); returns (None, None) when the user
    interrupts with Ctrl-C.
    """
    import getpass
    try:
        user_prompt = "Enter username for %s at %s: " % (realm, host)
        user = raw_input(user_prompt)
        passwd_prompt = "Enter password for %s in %s at %s: " % (
            user, realm, host)
        passwd = getpass.getpass(passwd_prompt)
        return user, passwd
    except KeyboardInterrupt:
        print
        return None, None
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000634
635
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000636# Utility functions
637
_localhost = None
def localhost():
    """Return the IP address that the name 'localhost' resolves to.

    The lookup is done once; the result is kept in the module-level
    _localhost and returned directly on later calls.
    """
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000645
_thishost = None
def thishost():
    """Return the IP address that this machine's host name resolves to.

    The lookup is done once; the result is kept in the module-level
    _thishost and returned directly on later calls.
    """
    global _thishost
    if _thishost:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000653
_ftperrors = None
def ftperrors():
    """Return ftplib.all_errors, importing ftplib only on first use."""
    global _ftperrors
    if _ftperrors:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000662
_noheaders = None
def noheaders():
    """Return a shared mimetools.Message built from an empty stream.

    The message is created on the first call and the same object is
    handed out afterwards.
    """
    global _noheaders
    if _noheaders:
        return _noheaders
    import mimetools
    import StringIO
    empty = StringIO.StringIO()
    _noheaders = mimetools.Message(empty, 0)
    _noheaders.fp.close()         # Recycle file descriptor
    return _noheaders
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000673
674
675# Utility classes
676
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000677class ftpwrapper:
Guido van Rossume7b146f2000-02-04 15:28:42 +0000678 """Class used by open_ftp() for cache of open FTP connections."""
679
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000680 def __init__(self, user, passwd, host, port, dirs):
681 self.user = user
682 self.passwd = passwd
683 self.host = host
684 self.port = port
685 self.dirs = dirs
686 self.init()
Guido van Rossume7b146f2000-02-04 15:28:42 +0000687
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000688 def init(self):
689 import ftplib
690 self.busy = 0
691 self.ftp = ftplib.FTP()
692 self.ftp.connect(self.host, self.port)
693 self.ftp.login(self.user, self.passwd)
694 for dir in self.dirs:
695 self.ftp.cwd(dir)
Guido van Rossume7b146f2000-02-04 15:28:42 +0000696
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000697 def retrfile(self, file, type):
698 import ftplib
699 self.endtransfer()
700 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
701 else: cmd = 'TYPE ' + type; isdir = 0
702 try:
703 self.ftp.voidcmd(cmd)
704 except ftplib.all_errors:
705 self.init()
706 self.ftp.voidcmd(cmd)
707 conn = None
708 if file and not isdir:
709 # Use nlst to see if the file exists at all
710 try:
711 self.ftp.nlst(file)
712 except ftplib.error_perm, reason:
713 raise IOError, ('ftp error', reason), sys.exc_info()[2]
714 # Restore the transfer mode!
715 self.ftp.voidcmd(cmd)
716 # Try to retrieve as a file
717 try:
718 cmd = 'RETR ' + file
719 conn = self.ftp.ntransfercmd(cmd)
720 except ftplib.error_perm, reason:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000721 if str(reason)[:3] != '550':
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000722 raise IOError, ('ftp error', reason), sys.exc_info()[2]
723 if not conn:
724 # Set transfer mode to ASCII!
725 self.ftp.voidcmd('TYPE A')
726 # Try a directory listing
727 if file: cmd = 'LIST ' + file
728 else: cmd = 'LIST'
729 conn = self.ftp.ntransfercmd(cmd)
730 self.busy = 1
731 # Pass back both a suitably decorated object and a retrieval length
732 return (addclosehook(conn[0].makefile('rb'),
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000733 self.endtransfer), conn[1])
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000734 def endtransfer(self):
735 if not self.busy:
736 return
737 self.busy = 0
738 try:
739 self.ftp.voidresp()
740 except ftperrors():
741 pass
Guido van Rossume7b146f2000-02-04 15:28:42 +0000742
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000743 def close(self):
744 self.endtransfer()
745 try:
746 self.ftp.close()
747 except ftperrors():
748 pass
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000749
class addbase:
    """Common base of the add* wrappers around an open file object."""

    def __init__(self, fp):
        self.fp = fp
        # Expose the wrapped object's bound methods directly.
        self.read = fp.read
        self.readline = fp.readline
        # These two are only copied when the wrapped object has them.
        for optional in ("readlines", "fileno"):
            if hasattr(fp, optional):
                setattr(self, optional, getattr(fp, optional))

    def __repr__(self):
        parts = (self.__class__.__name__, repr(id(self)), repr(self.fp))
        return '<%s at %s whose fp = %s>' % parts

    def close(self):
        """Drop the forwarded methods and close the wrapped file."""
        self.read = self.readline = None
        self.readlines = self.fileno = None
        if self.fp:
            self.fp.close()
        self.fp = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000771
class addclosehook(addbase):
    """Wrap an open file so that a callback runs when it is closed."""

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        """Close the file, then call closehook(*hookargs) once."""
        addbase.close(self)
        hook = self.closehook
        if not hook:
            return
        hook(*self.hookargs)
        # Forget the hook so that a second close() does not call it again.
        self.closehook = None
        self.hookargs = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000786
class addinfo(addbase):
    """Wrap an open file and attach a headers object to it."""

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000796
class addinfourl(addbase):
    """Wrap an open file and attach a headers object and a URL to it."""

    def __init__(self, fp, headers, url):
        addbase.__init__(self, fp)
        self.headers, self.url = headers, url

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers

    def geturl(self):
        """Return the URL given to the constructor."""
        return self.url
Guido van Rossume6ad8911996-09-10 17:02:56 +0000810
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000811
def basejoin(base, url):
    """Combine `url` with the base URL `base` to form a new URL.

    A `url` that has its own type (scheme) is returned unchanged.  One
    that has a host only inherits the type of `base`.  Otherwise the
    host is inherited as well, and a non-absolute path is resolved
    against the base path, with leading '../' components interpreted.
    """
    scheme, path = splittype(url)
    if scheme:
        # url is complete (it names its own type): nothing to join
        return url
    host, path = splithost(path)
    scheme, basepath = splittype(base)      # inherit type from base
    if host:
        # url names a host, so only the type can be inherited
        if scheme:
            return scheme + '://' + host + path
        # no type to inherit: url started with '//', return it as is
        return url
    host, basepath = splithost(basepath)    # inherit host
    basepath = splittag(basepath)[0]        # drop the base's '#tag'
    basepath = splitquery(basepath)[0]      # ... and its '?query'
    if not path.startswith('/'):
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query: keep the whole base path
            cut = len(basepath)
        else:
            # otherwise the last component of the base path is replaced
            cut = basepath.rfind('/')
        if cut >= 0:
            basepath = basepath[:cut + 1]
        elif host:
            # base path not absolute but a host is present: make absolute
            basepath = '/'
        else:
            # keep it non-absolute
            basepath = ''
        # Interpret ../ (important because of symlinks)
        while basepath and path.startswith('../'):
            path = path[3:]
            cut = basepath[:-1].rfind('/')
            if cut > 0:
                basepath = basepath[:cut + 1]
            elif cut == 0:
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    if scheme and host:
        return scheme + '://' + host + path
    if scheme:
        return scheme + ':' + path
    if host:
        return '//' + host + path   # don't know what this means
    return path
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000866
867
Guido van Rossum7c395db1994-07-04 22:14:49 +0000868# Utilities to parse URLs (most of these return None for missing parts):
Sjoerd Mullendere0371b81995-11-10 10:36:07 +0000869# unwrap('<URL:type://host/path>') --> 'type://host/path'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000870# splittype('type:opaquestring') --> 'type', 'opaquestring'
871# splithost('//host[:port]/path') --> 'host[:port]', '/path'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000872# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
873# splitpasswd('user:passwd') -> 'user', 'passwd'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000874# splitport('host:port') --> 'host', 'port'
875# splitquery('/path?query') --> '/path', 'query'
876# splittag('/path#tag') --> '/path', 'tag'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000877# splitattr('/path;attr1=value1;attr2=value2;...') ->
878# '/path', ['attr1=value1', 'attr2=value2', ...]
879# splitvalue('attr=value') --> 'attr', 'value'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000880# splitgophertype('/Xselector') --> 'X', 'selector'
881# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
883
def toBytes(url):
    """toBytes(u"URL") --> 'URL'.

    A Unicode string is encoded as ASCII; UnicodeError is raised when
    that is not possible.  Any other object is returned unchanged.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if type(url) is not type(u""):
        return url
    try:
        return url.encode("ASCII")
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
895
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Surrounding whitespace, one pair of enclosing angle brackets and a
    leading 'URL:' marker are removed, in that order.
    """
    text = url.strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000903
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  Without a leading 'type:' the
    result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # Compile on first use only.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # The pattern is the type followed by ':', so the match ends right
    # where the remainder of the URL begins.
    return found.group(1).lower(), url[found.end():]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000917
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Without a leading '//' the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        # Compile on first use only.
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1), found.group(2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000929
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    Both parts are passed through unquote().  Without an '@' the result
    is (None, host) and nothing is unquoted.
    """
    global _userprog
    if _userprog is None:
        # Compile on first use only.
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    found = _userprog.match(host)
    if found is None:
        return None, host
    return [unquote(part) for part in found.group(1, 2)]
Guido van Rossum7c395db1994-07-04 22:14:49 +0000941
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split is made at the first ':'; without one the result is
    (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        # Compile on first use only.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1), found.group(2)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000953
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  When `host` does not
    end in ':' followed by digits the result is (host, None).
    """
    global _portprog
    if _portprog is None:
        # Compile on first use only.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1), found.group(2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000966
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning a numeric port.

    Return `defport` (-1 unless given) as the port if there is no ':'.
    Return the port as an integer if a valid number follows the last ':'.
    Return None as the port if there is a ':' but no valid number after it.
    """
    global _nportprog
    if _nportprog is None:
        # Compile on first use only.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port = found.group(1, 2)
    if not port:
        # ':' with nothing after it
        return host, None
    try:
        return host, int(port)
    except ValueError:
        return host, None
Guido van Rossum53725a21996-06-13 19:12:35 +0000988
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split is made at the last '?'; without one the result is
    (url, None).
    """
    global _queryprog
    if _queryprog is None:
        # Compile on first use only.
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    found = _queryprog.match(url)
    if found is None:
        return url, None
    return found.group(1), found.group(2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001000
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split is made at the last '#'; without one the result is
    (url, None).
    """
    global _tagprog
    if _tagprog is None:
        # Compile on first use only.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1), found.group(2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001012
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when `url` contains no ';'.
    """
    attrs = url.split(';')
    path = attrs.pop(0)
    return path, attrs
Guido van Rossum7c395db1994-07-04 22:14:49 +00001018
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split is made at the first '='; without one the result is
    (attr, None).
    """
    global _valueprog
    if _valueprog is None:
        # Compile on first use only.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1), found.group(2)
Guido van Rossum7c395db1994-07-04 22:14:49 +00001030
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    When `selector` is not a '/' followed by at least one more
    character the result is (None, selector).
    """
    if selector.startswith('/') and len(selector) > 1:
        gtype, rest = selector[1], selector[2:]
        return gtype, rest
    return None, selector
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001036
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Every '%' followed by exactly two hexadecimal digits is replaced by
    the character with that code.  A '%' that is not followed by two
    hex digits (for example '100%', '%zz' or '%+f') is left in the
    result unchanged.
    """
    hexdigits = '0123456789abcdefABCDEF'
    pieces = s.split('%')
    # Everything before the first '%' is copied as is.
    res = [pieces[0]]
    for item in pieces[1:]:
        code = item[:2]
        decoded = None
        # int(x, 16) alone is too lenient: it also accepts a sign or
        # surrounding whitespace, so both characters are checked here.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            try:
                decoded = chr(int(code, 16)) + item[2:]
            except ValueError:
                # NOTE(review): presumably only reachable when a non-ASCII
                # byte is joined to a Unicode remainder; the escape is then
                # kept verbatim, as before.
                decoded = None
        if decoded is None:
            decoded = '%' + item
        res.append(decoded)
    return "".join(res)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001055
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'

    Like unquote(), but every '+' is turned into a space first.
    """
    return unquote(s.replace('+', ' '))
Guido van Rossum0564e121996-12-13 14:47:36 +00001062
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')

# The default safe set of quote(); _fast_quote() handles exactly this set.
_fast_safe_test = always_safe + '/'
# Lookup table for that set, built on the first call of _fast_quote().
_fast_safe = None

def _fast_quote(s):
    """Quote `s`, leaving only the characters of _fast_safe_test alone."""
    global _fast_safe
    if _fast_safe is None:
        table = {}
        for ch in _fast_safe_test:
            table[ch] = ch
        _fast_safe = table
    table = _fast_safe
    out = []
    for ch in s:
        if ch in table:
            out.append(ch)
        else:
            out.append('%%%02X' % ord(ch))
    return ''.join(out)

def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Every character of `s` that is neither in always_safe nor in
    `safe` is replaced by '%' and its two-digit upper-case hex code.

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    The default `safe` of '/' suits the path section of a URL, where
    the existing slashes act as separators and must stay as they are.
    """
    allowed = always_safe + safe
    if allowed == _fast_safe_test:
        # The default safe set has a dictionary-based shortcut.
        return _fast_quote(s)
    out = []
    for ch in s:
        if ch in allowed:
            out.append(ch)
        else:
            out.append('%%%02X' % ord(ch))
    return ''.join(out)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001113
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'

    Everything between the spaces is quoted with quote(), using `safe`
    as its safe set.
    """
    if ' ' not in s:
        return quote(s, safe)
    return '+'.join([quote(word, safe) for word in s.split(' ')])
Guido van Rossum0564e121996-12-13 14:47:36 +00001123
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001124def urlencode(query,doseq=0):
1125 """Encode a sequence of two-element tuples or dictionary into a URL query string.
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001126
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001127 If any values in the query arg are sequences and doseq is true, each
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001128 sequence element is converted to a separate parameter.
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001129
1130 If the query arg is a sequence of two-element tuples, the order of the
1131 parameters in the output will match the order of parameters in the
1132 input.
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001133 """
Tim Peters658cba62001-02-09 20:06:00 +00001134
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001135 if hasattr(query,"items"):
1136 # mapping objects
1137 query = query.items()
1138 else:
1139 # it's a bother at times that strings and string-like objects are
1140 # sequences...
1141 try:
1142 # non-sequence items should not work with len()
1143 x = len(query)
1144 # non-empty strings will fail this
1145 if len(query) and type(query[0]) != types.TupleType:
1146 raise TypeError
1147 # zero-length sequences of all types will get here and succeed,
1148 # but that's a minor nit - since the original implementation
1149 # allowed empty dicts that type of behavior probably should be
1150 # preserved for consistency
1151 except TypeError:
1152 ty,va,tb = sys.exc_info()
1153 raise TypeError, "not a valid non-string sequence or mapping object", tb
1154
Guido van Rossume7b146f2000-02-04 15:28:42 +00001155 l = []
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001156 if not doseq:
1157 # preserve old behavior
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001158 for k, v in query:
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001159 k = quote_plus(str(k))
1160 v = quote_plus(str(v))
1161 l.append(k + '=' + v)
1162 else:
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001163 for k, v in query:
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001164 k = quote_plus(str(k))
1165 if type(v) == types.StringType:
1166 v = quote_plus(v)
1167 l.append(k + '=' + v)
1168 elif type(v) == types.UnicodeType:
1169 # is there a reasonable way to convert to ASCII?
1170 # encode generates a string, but "replace" or "ignore"
1171 # lose information and "strict" can raise UnicodeError
1172 v = quote_plus(v.encode("ASCII","replace"))
1173 l.append(k + '=' + v)
1174 else:
1175 try:
1176 # is this a sufficient test for sequence-ness?
1177 x = len(v)
1178 except TypeError:
1179 # not a sequence
1180 v = quote_plus(str(v))
1181 l.append(k + '=' + v)
1182 else:
1183 # loop over the sequence
1184 for elt in v:
1185 l.append(k + '=' + quote_plus(str(elt)))
Guido van Rossumb2493f82000-12-15 15:01:37 +00001186 return '&'.join(l)
Guido van Rossum810a3391998-07-22 21:33:23 +00001187
Guido van Rossum442e7201996-03-20 15:33:11 +00001188# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Every non-empty environment variable whose lower-cased name ends in
    '_proxy' contributes one entry, keyed by the name without that
    suffix.  If you need a different way, you can pass a proxies
    dictionary to the [Fancy]URLopener constructor.

    """
    suffix = '_proxy'
    proxies = {}
    for name, value in os.environ.items():
        if not value:
            continue
        lowered = name.lower()
        if lowered.endswith(suffix):
            proxies[lowered[:-len(suffix)]] = value
    return proxies
1204
if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store proxies.
        The HTTP proxy is taken from the 'HTTPProxyHost' setting when
        'UseHTTPProxy' is set.  An empty dictionary is returned when the
        ic module or the Internet Config database is not available.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.  The 'ProxyEnable' and
        'ProxyServer' values of the current user's Internet Settings key
        are read.  'ProxyServer' is either a ';'-separated list of
        'protocol=address' entries or one address used for both http
        and ftp.  An empty or partially filled dictionary is returned
        when the key is missing or a value has an unexpected format.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            # Close the key handle even when a query or the parsing
            # below raises.
            try:
                proxyEnable = _winreg.QueryValueEx(internetSettings,
                                                   'ProxyEnable')[0]
                if proxyEnable:
                    # Returned as Unicode but problems if not converted
                    # to ASCII
                    proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                           'ProxyServer')[0])
                    if '=' in proxyServer:
                        # Per-protocol settings
                        for p in proxyServer.split(';'):
                            protocol, address = p.split('=', 1)
                            proxies[protocol] = '%s://%s' % (protocol, address)
                    else:
                        # Use one setting for all protocols
                        if proxyServer[:5] == 'http:':
                            proxies['http'] = proxyServer
                        else:
                            proxies['http'] = 'http://%s' % proxyServer
                            proxies['ftp'] = 'ftp://%s' % proxyServer
            finally:
                internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()
else:
    # By default use environment variables
    getproxies = getproxies_environment
1289
Guido van Rossum442e7201996-03-20 15:33:11 +00001290
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001291# Test and time quote() and unquote()
def test1():
    """Quote and unquote all 256 characters, four times over.

    Prints 'Wrong!' when the round trip does not give back the input,
    then the repr of the input, of the quoted and of the unquoted text,
    and finally the time taken.
    """
    import time
    s = ''.join([chr(code) for code in range(256)]) * 4
    started = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    finished = time.time()
    if uqs != s:
        print('Wrong!')
    for text in (s, qs, uqs):
        print(repr(text))
    print('%s sec' % round(finished - started, 3))
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001307
1308
def reporthook(blocknum, blocksize, totalsize):
    """Print one progress line; passed to urlretrieve() by test()."""
    # Report during remote transfers
    template = "Block number: %d, Block size: %d, Total size: %d"
    print(template % (blocknum, blocksize, totalsize))
Guido van Rossum9ab96d41998-09-28 14:07:00 +00001313
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001314# Test program
def test(args=[]):
    """Retrieve each URL in `args` and print what was fetched.

    For every URL the local file name returned by urlretrieve(), the
    headers (if any) and the data with carriage returns removed are
    printed.  Without arguments a built-in list of sample URLs is used.
    urlcleanup() is always called at the end.
    """
    if not args:
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.python.org/etc/passwd',
##          'gopher://gopher.micro.umn.edu/1/',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print('%s %s %s' % ('-'*10, url, '-'*10))
            fn, h = urlretrieve(url, None, reporthook)
            print(fn)
            if h:
                print('======')
                for k in h.keys():
                    print('%s %s' % (k + ':', h[k]))
                print('======')
            # Close the file explicitly instead of relying on the
            # object being reclaimed when the name is deleted.
            fp = open(fn, 'rb')
            try:
                data = fp.read()
            finally:
                fp.close()
            if '\r' in data:
                data = data.replace('\r', '')
            print(data)
            fn, h = None, None
        print('-'*40)
    finally:
        urlcleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001347
def main():
    """Command-line driver: self-test with -t, otherwise dump each URL.

    Options (parsed from sys.argv[1:] with getopt):
      -h  print a usage summary and return
      -t  run test(args); if given more than once, run test1() first

    Without -t, the contents of every URL argument are copied to
    standard output; with no URL arguments only a hint to use -h is
    printed.  Option-parsing errors are reported on standard output
    and the function returns normally.
    """
    import getopt
    import sys

    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error:
        # Fetch the exception via sys.exc_info() rather than the
        # "except X, msg" spelling, which newer parsers reject.
        print(sys.exc_info()[1])
        print("Use -h for help")
        return

    flags = [flag for flag, value in opts]
    if '-h' in flags:
        # -h wins over everything else, wherever it appears.
        print("Usage: python urllib.py [-t] [url ...]")
        print("-t runs self-test;"
              " otherwise, contents of urls are printed")
        return

    if '-t' in flags:
        if flags.count('-t') > 1:
            test1()
        test(args)
        return

    if not args:
        print("Use -h for help")
    for url in args:
        response = urlopen(url)
        try:
            # write() instead of "print data,": the print statement's
            # soft-space logic could insert a blank between the bodies
            # of consecutive URLs (and a newline at exit), so the dump
            # was not a faithful copy of the contents.
            sys.stdout.write(response.read())
        finally:
            # Release the connection/file promptly instead of waiting
            # for garbage collection.
            response.close()
Guido van Rossum23490151998-06-25 02:39:00 +00001374
# Script entry point: hand control to main() only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main()