blob: a1dcbdaedb858caf7767eba0ab9fe86a660dd457 [file] [log] [blame]
"""Open an arbitrary URL.

See the following document for more info on URLs:
"Names and Addresses, URIs, URLs, URNs, URCs", at
http://www.w3.org/pub/WWW/Addressing/Overview.html

See also the HTTP spec (from which the error codes are derived):
"HTTP - Hypertext Transfer Protocol", at
http://www.w3.org/pub/WWW/Protocols/

Related standards and specs:
- RFC1808: the "relative URL" spec. (authoritative status)
- RFC1738 - the "URL standard". (authoritative status)
- RFC1630 - the "URI spec". (informational status)

The object returned by URLopener().open(file) will differ per
protocol.  All you know is that it has methods read(), readline(),
readlines(), fileno(), close() and info().  The read*(), fileno()
and close() methods work like those of open files.
The info() method returns a mimetools.Message object which can be
used to query various info about the object, if available.
(mimetools.Message objects are queried with the getheader() method.)
"""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000024
Guido van Rossum7c395db1994-07-04 22:14:49 +000025import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000026import socket
Jack Jansendc3e3f61995-12-15 13:22:13 +000027import os
Guido van Rossum3c8484e1996-11-20 22:02:24 +000028import sys
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000029
30
__version__ = '1.12'    # XXX This version is not always updated :-(

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Pick the URL <-> local pathname converters for this platform.
# 'nt' and 'mac' have dedicated helper modules; every other platform
# falls back to plain quoting/unquoting of the path.
if os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """Convert the path part of a URL to a local pathname (unquote it)."""
        return unquote(pathname)

    def pathname2url(pathname):
        """Convert a local pathname to the path part of a URL (quote it)."""
        return quote(pathname)
Guido van Rossum33add0a1998-12-18 15:25:22 +000045
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000046# This really consists of two pieces:
47# (1) a class which handles opening of all sorts of URLs
48# (plus assorted utilities etc.)
49# (2) a set of functions for parsing URLs
50# XXX Should these be separated out into different modules?
51
52
53# Shortcut for basic usage
_urlopener = None
def urlopen(url, data=None):
    """Open url with the shared module-level FancyURLopener.

    The opener is created on first use and kept in the global
    _urlopener.  data is forwarded to open() only when it is not None.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    if data is not None:
        return opener.open(url, data)
    return opener.open(url)
def urlretrieve(url, filename=None, reporthook=None):
    """Retrieve url through the shared module-level FancyURLopener.

    The opener is created on first use; the arguments are passed
    straight to its retrieve() method and its result is returned.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook)
def urlcleanup():
    """Run cleanup() on the shared opener, if one has been created."""
    opener = _urlopener
    if not opener:
        return
    opener.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000071
72
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000073ftpcache = {}
74class URLopener:
Guido van Rossume7b146f2000-02-04 15:28:42 +000075 """Class to open URLs.
76 This is a class rather than just a subroutine because we may need
77 more than one set of global protocol-specific options.
78 Note -- this is a base class for those who don't want the
79 automatic handling of errors type 302 (relocated) and 401
80 (authorization needed)."""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000081
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000082 __tempfiles = None
Guido van Rossum29e77811996-11-27 19:39:58 +000083
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000084 # Constructor
Guido van Rossum09c8b6c1999-12-07 21:37:17 +000085 def __init__(self, proxies=None, **x509):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000086 if proxies is None:
87 proxies = getproxies()
88 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
89 self.proxies = proxies
Guido van Rossum09c8b6c1999-12-07 21:37:17 +000090 self.key_file = x509.get('key_file')
91 self.cert_file = x509.get('cert_file')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000092 server_version = "Python-urllib/%s" % __version__
93 self.addheaders = [('User-agent', server_version)]
94 self.__tempfiles = []
95 self.__unlink = os.unlink # See cleanup()
96 self.tempcache = None
97 # Undocumented feature: if you assign {} to tempcache,
98 # it is used to cache files retrieved with
99 # self.retrieve(). This is not enabled by default
100 # since it does not work for changing documents (and I
101 # haven't got the logic to check expiration headers
102 # yet).
103 self.ftpcache = ftpcache
104 # Undocumented feature: you can use a different
105 # ftp cache by assigning to the .ftpcache member;
106 # in case you want logically independent URL openers
107 # XXX This is not threadsafe. Bah.
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000108
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000109 def __del__(self):
110 self.close()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000111
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000112 def close(self):
113 self.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000114
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000115 def cleanup(self):
116 # This code sometimes runs when the rest of this module
117 # has already been deleted, so it can't use any globals
118 # or import anything.
119 if self.__tempfiles:
120 for file in self.__tempfiles:
121 try:
122 self.__unlink(file)
123 except:
124 pass
125 del self.__tempfiles[:]
126 if self.tempcache:
127 self.tempcache.clear()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000128
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000129 def addheader(self, *args):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000130 """Add a header to be used by the HTTP interface only
131 e.g. u.addheader('Accept', 'sound/basic')"""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000132 self.addheaders.append(args)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000133
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000134 # External interface
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000135 def open(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000136 """Use URLopener().open(file) instead of open(file, 'r')."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000137 fullurl = unwrap(fullurl)
138 if self.tempcache and self.tempcache.has_key(fullurl):
139 filename, headers = self.tempcache[fullurl]
140 fp = open(filename, 'rb')
141 return addinfourl(fp, headers, fullurl)
142 type, url = splittype(fullurl)
143 if not type: type = 'file'
144 if self.proxies.has_key(type):
145 proxy = self.proxies[type]
146 type, proxy = splittype(proxy)
147 host, selector = splithost(proxy)
148 url = (host, fullurl) # Signal special case to open_*()
149 name = 'open_' + type
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000150 self.type = type
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000151 if '-' in name:
152 # replace - with _
153 name = string.join(string.split(name, '-'), '_')
154 if not hasattr(self, name):
155 if data is None:
156 return self.open_unknown(fullurl)
157 else:
158 return self.open_unknown(fullurl, data)
159 try:
160 if data is None:
161 return getattr(self, name)(url)
162 else:
163 return getattr(self, name)(url, data)
164 except socket.error, msg:
165 raise IOError, ('socket error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000166
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000167 def open_unknown(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000168 """Overridable interface to open unknown URL type."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000169 type, url = splittype(fullurl)
170 raise IOError, ('url error', 'unknown url type', type)
Guido van Rossumca445401995-08-29 19:19:12 +0000171
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000172 # External interface
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000173 def retrieve(self, url, filename=None, reporthook=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000174 """retrieve(url) returns (filename, None) for a local object
175 or (tempfilename, headers) for a remote object."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000176 url = unwrap(url)
177 if self.tempcache and self.tempcache.has_key(url):
178 return self.tempcache[url]
179 type, url1 = splittype(url)
180 if not filename and (not type or type == 'file'):
181 try:
182 fp = self.open_local_file(url1)
183 hdrs = fp.info()
184 del fp
185 return url2pathname(splithost(url1)[1]), hdrs
186 except IOError, msg:
187 pass
188 fp = self.open(url)
189 headers = fp.info()
190 if not filename:
191 import tempfile
192 garbage, path = splittype(url)
193 garbage, path = splithost(path or "")
194 path, garbage = splitquery(path or "")
195 path, garbage = splitattr(path or "")
196 suffix = os.path.splitext(path)[1]
197 filename = tempfile.mktemp(suffix)
198 self.__tempfiles.append(filename)
199 result = filename, headers
200 if self.tempcache is not None:
201 self.tempcache[url] = result
202 tfp = open(filename, 'wb')
203 bs = 1024*8
204 size = -1
205 blocknum = 1
206 if reporthook:
207 if headers.has_key("content-length"):
208 size = int(headers["Content-Length"])
209 reporthook(0, bs, size)
210 block = fp.read(bs)
211 if reporthook:
212 reporthook(1, bs, size)
213 while block:
214 tfp.write(block)
215 block = fp.read(bs)
216 blocknum = blocknum + 1
217 if reporthook:
218 reporthook(blocknum, bs, size)
219 fp.close()
220 tfp.close()
221 del fp
222 del tfp
223 return result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000224
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000225 # Each method named open_<type> knows how to open that type of URL
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000226
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000227 def open_http(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000228 """Use HTTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000229 import httplib
230 user_passwd = None
231 if type(url) is type(""):
232 host, selector = splithost(url)
233 if host:
234 user_passwd, host = splituser(host)
235 host = unquote(host)
236 realhost = host
237 else:
238 host, selector = url
239 urltype, rest = splittype(selector)
240 url = rest
241 user_passwd = None
242 if string.lower(urltype) != 'http':
243 realhost = None
244 else:
245 realhost, rest = splithost(rest)
246 if realhost:
247 user_passwd, realhost = splituser(realhost)
248 if user_passwd:
249 selector = "%s://%s%s" % (urltype, realhost, rest)
250 #print "proxy via http:", host, selector
251 if not host: raise IOError, ('http error', 'no host given')
252 if user_passwd:
253 import base64
254 auth = string.strip(base64.encodestring(user_passwd))
255 else:
256 auth = None
257 h = httplib.HTTP(host)
258 if data is not None:
259 h.putrequest('POST', selector)
260 h.putheader('Content-type', 'application/x-www-form-urlencoded')
261 h.putheader('Content-length', '%d' % len(data))
262 else:
263 h.putrequest('GET', selector)
264 if auth: h.putheader('Authorization', 'Basic %s' % auth)
265 if realhost: h.putheader('Host', realhost)
266 for args in self.addheaders: apply(h.putheader, args)
267 h.endheaders()
268 if data is not None:
269 h.send(data + '\r\n')
270 errcode, errmsg, headers = h.getreply()
271 fp = h.getfile()
272 if errcode == 200:
273 return addinfourl(fp, headers, "http:" + url)
274 else:
275 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000276 return self.http_error(url, fp, errcode, errmsg, headers)
Guido van Rossum29aab751999-03-09 19:31:21 +0000277 else:
278 return self.http_error(url, fp, errcode, errmsg, headers, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000279
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000280 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000281 """Handle http errors.
282 Derived class can override this, or provide specific handlers
283 named http_error_DDD where DDD is the 3-digit error code."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000284 # First check if there's a specific handler for this error
285 name = 'http_error_%d' % errcode
286 if hasattr(self, name):
287 method = getattr(self, name)
288 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000289 result = method(url, fp, errcode, errmsg, headers)
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000290 else:
291 result = method(url, fp, errcode, errmsg, headers, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000292 if result: return result
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000293 return self.http_error_default(url, fp, errcode, errmsg, headers)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000294
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000295 def http_error_default(self, url, fp, errcode, errmsg, headers):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000296 """Default error handler: close the connection and raise IOError."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000297 void = fp.read()
298 fp.close()
299 raise IOError, ('http error', errcode, errmsg, headers)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000300
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000301 if hasattr(socket, "ssl"):
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000302 def open_https(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000303 """Use HTTPS protocol."""
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000304 import httplib
305 if type(url) is type(""):
306 host, selector = splithost(url)
307 user_passwd, host = splituser(host)
308 else:
309 host, selector = url
310 urltype, rest = splittype(selector)
311 if string.lower(urltype) == 'https':
312 realhost, rest = splithost(rest)
313 user_passwd, realhost = splituser(realhost)
314 if user_passwd:
315 selector = "%s://%s%s" % (urltype, realhost, rest)
316 print "proxy via https:", host, selector
317 if not host: raise IOError, ('https error', 'no host given')
318 if user_passwd:
319 import base64
320 auth = string.strip(base64.encodestring(user_passwd))
321 else:
322 auth = None
323 h = httplib.HTTPS(host, 0,
324 key_file=self.key_file,
325 cert_file=self.cert_file)
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000326 if data is not None:
327 h.putrequest('POST', selector)
328 h.putheader('Content-type',
329 'application/x-www-form-urlencoded')
330 h.putheader('Content-length', '%d' % len(data))
331 else:
332 h.putrequest('GET', selector)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000333 if auth: h.putheader('Authorization: Basic %s' % auth)
334 for args in self.addheaders: apply(h.putheader, args)
335 h.endheaders()
336 errcode, errmsg, headers = h.getreply()
337 fp = h.getfile()
338 if errcode == 200:
339 return addinfourl(fp, headers, url)
340 else:
341 return self.http_error(url, fp, errcode, errmsg, headers)
342
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000343 def open_gopher(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000344 """Use Gopher protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000345 import gopherlib
346 host, selector = splithost(url)
347 if not host: raise IOError, ('gopher error', 'no host given')
348 host = unquote(host)
349 type, selector = splitgophertype(selector)
350 selector, query = splitquery(selector)
351 selector = unquote(selector)
352 if query:
353 query = unquote(query)
354 fp = gopherlib.send_query(selector, query, host)
355 else:
356 fp = gopherlib.send_selector(selector, host)
357 return addinfourl(fp, noheaders(), "gopher:" + url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000358
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000359 def open_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000360 """Use local file or FTP depending on form of URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000361 if url[:2] == '//' and url[2:3] != '/':
362 return self.open_ftp(url)
363 else:
364 return self.open_local_file(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000365
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000366 def open_local_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000367 """Use local file."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000368 import mimetypes, mimetools, StringIO
369 mtype = mimetypes.guess_type(url)[0]
370 headers = mimetools.Message(StringIO.StringIO(
371 'Content-Type: %s\n' % (mtype or 'text/plain')))
372 host, file = splithost(url)
373 if not host:
Guido van Rossum336a2011999-06-24 15:27:36 +0000374 urlfile = file
375 if file[:1] == '/':
376 urlfile = 'file://' + file
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000377 return addinfourl(open(url2pathname(file), 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000378 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000379 host, port = splitport(host)
380 if not port \
381 and socket.gethostbyname(host) in (localhost(), thishost()):
Guido van Rossum336a2011999-06-24 15:27:36 +0000382 urlfile = file
383 if file[:1] == '/':
384 urlfile = 'file://' + file
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000385 return addinfourl(open(url2pathname(file), 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000386 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000387 raise IOError, ('local file error', 'not on local host')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000388
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000389 def open_ftp(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000390 """Use FTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000391 host, path = splithost(url)
392 if not host: raise IOError, ('ftp error', 'no host given')
393 host, port = splitport(host)
394 user, host = splituser(host)
395 if user: user, passwd = splitpasswd(user)
396 else: passwd = None
397 host = unquote(host)
398 user = unquote(user or '')
399 passwd = unquote(passwd or '')
400 host = socket.gethostbyname(host)
401 if not port:
402 import ftplib
403 port = ftplib.FTP_PORT
404 else:
405 port = int(port)
406 path, attrs = splitattr(path)
407 path = unquote(path)
408 dirs = string.splitfields(path, '/')
409 dirs, file = dirs[:-1], dirs[-1]
410 if dirs and not dirs[0]: dirs = dirs[1:]
Guido van Rossum5e006a31999-08-18 17:40:33 +0000411 if dirs and not dirs[0]: dirs[0] = '/'
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000412 key = (user, host, port, string.joinfields(dirs, '/'))
413 # XXX thread unsafe!
414 if len(self.ftpcache) > MAXFTPCACHE:
415 # Prune the cache, rather arbitrarily
416 for k in self.ftpcache.keys():
417 if k != key:
418 v = self.ftpcache[k]
419 del self.ftpcache[k]
420 v.close()
421 try:
422 if not self.ftpcache.has_key(key):
423 self.ftpcache[key] = \
424 ftpwrapper(user, passwd, host, port, dirs)
425 if not file: type = 'D'
426 else: type = 'I'
427 for attr in attrs:
428 attr, value = splitvalue(attr)
429 if string.lower(attr) == 'type' and \
430 value in ('a', 'A', 'i', 'I', 'd', 'D'):
431 type = string.upper(value)
432 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
433 if retrlen is not None and retrlen >= 0:
434 import mimetools, StringIO
435 headers = mimetools.Message(StringIO.StringIO(
436 'Content-Length: %d\n' % retrlen))
437 else:
438 headers = noheaders()
439 return addinfourl(fp, headers, "ftp:" + url)
440 except ftperrors(), msg:
441 raise IOError, ('ftp error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000442
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000443 def open_data(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000444 """Use "data" URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000445 # ignore POSTed data
446 #
447 # syntax of data URLs:
448 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
449 # mediatype := [ type "/" subtype ] *( ";" parameter )
450 # data := *urlchar
451 # parameter := attribute "=" value
452 import StringIO, mimetools, time
453 try:
454 [type, data] = string.split(url, ',', 1)
455 except ValueError:
456 raise IOError, ('data error', 'bad data URL')
457 if not type:
458 type = 'text/plain;charset=US-ASCII'
459 semi = string.rfind(type, ';')
460 if semi >= 0 and '=' not in type[semi:]:
461 encoding = type[semi+1:]
462 type = type[:semi]
463 else:
464 encoding = ''
465 msg = []
466 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
467 time.gmtime(time.time())))
468 msg.append('Content-type: %s' % type)
469 if encoding == 'base64':
470 import base64
471 data = base64.decodestring(data)
472 else:
473 data = unquote(data)
474 msg.append('Content-length: %d' % len(data))
475 msg.append('')
476 msg.append(data)
477 msg = string.join(msg, '\n')
478 f = StringIO.StringIO(msg)
479 headers = mimetools.Message(f, 0)
480 f.fileno = None # needed for addinfourl
481 return addinfourl(f, headers, url)
Guido van Rossum6d4d1c21998-03-12 14:32:55 +0000482
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000483
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as URLopener.__init__.

        Keyword arguments (e.g. key_file, cert_file) are passed
        through to the base class.
        """
        apply(URLopener.__init__, (self,) + args, kwargs)
        # Maps 'realm@host' to a (user, passwd) pair.
        self.auth_cache = {}

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        Opens the URL from the 'location' (or else 'uri') header.
        Returns None when neither header is present, so that
        http_error() falls back to the default handler.
        """
        # XXX The server can force infinite recursion here!
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        fp.read()       # read the rest of the reply before closing
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin("http:" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt

        Only the 'basic' scheme is handled, by calling the method
        named retry_<type>_basic_auth.  Returns None in every other
        case, so that http_error() falls back to the default handler.
        """
        if headers.has_key('www-authenticate'):
            stuff = headers['www-authenticate']
            import re
            match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
            if match:
                scheme, realm = match.groups()
                if string.lower(scheme) == 'basic':
                    name = 'retry_' + self.type + '_basic_auth'
                    if data is None:
                        return getattr(self,name)(url, realm)
                    else:
                        return getattr(self,name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Retry an http request with a user:passwd@ prefix on the host.

        Returns None when no user name or password is obtained.
        """
        host, selector = splithost(url)
        # i is non-zero when the failed URL already carried credentials;
        # that tells get_user_passwd() to discard its cached entry.
        i = string.find(host, '@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = user + ':' + passwd + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Retry an https request with a user:passwd@ prefix on the host.

        Returns None when no user name or password is obtained.
        """
        host, selector = splithost(url)
        # See retry_http_basic_auth() for the meaning of i.
        i = string.find(host, '@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = user + ':' + passwd + '@' + host
        newurl = '//' + host + selector
        # Pass the POST data along so the retry repeats the same request.
        if data is None:
            return self.open_https(newurl)
        else:
            return self.open_https(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return (user, passwd) for realm at host.

        A cached pair is returned unless clear_cache is true, in
        which case it is dropped and prompt_user_passwd() is asked
        again.  A new answer is cached unless both parts are false.
        """
        key = realm + '@' + string.lower(host)
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!

        Asks for a user name and password on the terminal and returns
        them; returns (None, None) on KeyboardInterrupt.
        """
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000580
581
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000582# Utility functions
583
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The address is looked up once and remembered in _localhost.
    """
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000591
_thishost = None
def thishost():
    """Return the IP address of the current host.

    The address is looked up once and remembered in _thishost.
    """
    global _thishost
    if _thishost:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000599
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported on first use only; the value is remembered
    in _ftperrors.
    """
    global _ftperrors
    if _ftperrors:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000608
_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object.

    The object kept in _noheaders is returned when it tests true;
    otherwise a new one is built from an empty StringIO and stored.
    """
    global _noheaders
    if _noheaders:
        return _noheaders
    import mimetools
    import StringIO
    _noheaders = mimetools.Message(StringIO.StringIO(), 0)
    _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000619
620
621# Utility classes
622
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000623class ftpwrapper:
Guido van Rossume7b146f2000-02-04 15:28:42 +0000624 """Class used by open_ftp() for cache of open FTP connections."""
625
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000626 def __init__(self, user, passwd, host, port, dirs):
627 self.user = user
628 self.passwd = passwd
629 self.host = host
630 self.port = port
631 self.dirs = dirs
632 self.init()
Guido van Rossume7b146f2000-02-04 15:28:42 +0000633
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000634 def init(self):
635 import ftplib
636 self.busy = 0
637 self.ftp = ftplib.FTP()
638 self.ftp.connect(self.host, self.port)
639 self.ftp.login(self.user, self.passwd)
640 for dir in self.dirs:
641 self.ftp.cwd(dir)
Guido van Rossume7b146f2000-02-04 15:28:42 +0000642
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000643 def retrfile(self, file, type):
644 import ftplib
645 self.endtransfer()
646 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
647 else: cmd = 'TYPE ' + type; isdir = 0
648 try:
649 self.ftp.voidcmd(cmd)
650 except ftplib.all_errors:
651 self.init()
652 self.ftp.voidcmd(cmd)
653 conn = None
654 if file and not isdir:
655 # Use nlst to see if the file exists at all
656 try:
657 self.ftp.nlst(file)
658 except ftplib.error_perm, reason:
659 raise IOError, ('ftp error', reason), sys.exc_info()[2]
660 # Restore the transfer mode!
661 self.ftp.voidcmd(cmd)
662 # Try to retrieve as a file
663 try:
664 cmd = 'RETR ' + file
665 conn = self.ftp.ntransfercmd(cmd)
666 except ftplib.error_perm, reason:
667 if reason[:3] != '550':
668 raise IOError, ('ftp error', reason), sys.exc_info()[2]
669 if not conn:
670 # Set transfer mode to ASCII!
671 self.ftp.voidcmd('TYPE A')
672 # Try a directory listing
673 if file: cmd = 'LIST ' + file
674 else: cmd = 'LIST'
675 conn = self.ftp.ntransfercmd(cmd)
676 self.busy = 1
677 # Pass back both a suitably decorated object and a retrieval length
678 return (addclosehook(conn[0].makefile('rb'),
679 self.endtransfer), conn[1])
680 def endtransfer(self):
681 if not self.busy:
682 return
683 self.busy = 0
684 try:
685 self.ftp.voidresp()
686 except ftperrors():
687 pass
Guido van Rossume7b146f2000-02-04 15:28:42 +0000688
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000689 def close(self):
690 self.endtransfer()
691 try:
692 self.ftp.close()
693 except ftperrors():
694 pass
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000695
class addbase:
    """Base class for addinfo and addclosehook."""

    def __init__(self, fp):
        """Wrap fp, exposing its read methods directly on this object.

        read and readline are always taken from fp; readlines and
        fileno only when fp has them.
        """
        self.fp = fp
        self.read = fp.read
        self.readline = fp.readline
        for optional in ("readlines", "fileno"):
            if hasattr(fp, optional):
                setattr(self, optional, getattr(fp, optional))

    def __repr__(self):
        details = (self.__class__.__name__, repr(id(self)), repr(self.fp))
        return '<%s at %s whose fp = %s>' % details

    def close(self):
        """Drop the borrowed methods, close fp and forget it."""
        for name in ("read", "readline", "readlines", "fileno"):
            setattr(self, name, None)
        if self.fp:
            self.fp.close()
        self.fp = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000717
class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        """Wrap fp; closehook(*hookargs) will be called by close()."""
        addbase.__init__(self, fp)
        self.closehook, self.hookargs = closehook, hookargs

    def close(self):
        """Run the hook (at most once), then close the wrapped file."""
        hook = self.closehook
        if hook:
            apply(hook, self.hookargs)
            # Clear both so that a second close() skips the hook.
            self.closehook = None
            self.hookargs = None
        addbase.close(self)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000732
class addinfo(addbase):
    """Class to add an info() method to an open file.

    headers is stored as given and handed back unchanged by info().
    """

    def __init__(self, fp, headers):
        self.headers = headers
        addbase.__init__(self, fp)

    def info(self):
        """Return the headers object passed to the constructor."""
        return self.headers
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000742
class addinfourl(addbase):
    """Class to add info() and geturl() methods to an open file.

    headers and url are stored as given and handed back unchanged.
    """

    def __init__(self, fp, headers, url):
        self.headers = headers
        self.url = url
        addbase.__init__(self, fp)

    def info(self):
        """Return the headers object passed to the constructor."""
        return self.headers

    def geturl(self):
        """Return the URL passed to the constructor."""
        return self.url
Guido van Rossume6ad8911996-09-10 17:02:56 +0000756
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000757
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL.

    Resolution order, as implemented below:

    - If url has its own type (scheme), it is returned unchanged.
    - If url has a host ('//host...'), only the type is inherited from
      base; when base has no type either, url is returned unchanged.
    - Otherwise the host is inherited from base, base's tag ('#...') and
      query ('?...') are dropped, and a non-absolute path in url is
      resolved against base's path, including leading '../' components.

    The pieces are then put back together with whatever type and host
    were found.
    """
    type, path = splittype(url)
    if type:
        # if url is complete (i.e., it contains a type), return it
        return url
    host, path = splithost(path)
    type, basepath = splittype(base) # inherit type from base
    if host:
        # if url contains host, just inherit type
        if type: return type + '://' + host + path
        else:
            # no type inherited, so url must have started with //
            # just return it
            return url
    host, basepath = splithost(basepath) # inherit host
    basepath, basetag = splittag(basepath) # remove extraneous cruft
    basepath, basequery = splitquery(basepath) # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to basepath
            # (i == len(basepath) makes the slice below keep all of it)
            i = len(basepath)
        else:
            # else replace last component
            i = string.rfind(basepath, '/')
        if i < 0:
            # basepath not absolute
            if host:
                # host present, make absolute
                basepath = '/'
            else:
                # else keep non-absolute
                basepath = ''
        else:
            # remove last file component
            basepath = basepath[:i+1]
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            # basepath ends in '/', so search for the slash before it
            i = string.rfind(basepath[:-1], '/')
            if i > 0:
                basepath = basepath[:i+1]
            elif i == 0:
                # reached the root; further '../' are left in path
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    if type and host: return type + '://' + host + path
    elif type: return type + ':' + path
    elif host: return '//' + host + path # don't know what this means
    else: return path
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000812
813
Guido van Rossum7c395db1994-07-04 22:14:49 +0000814# Utilities to parse URLs (most of these return None for missing parts):
Sjoerd Mullendere0371b81995-11-10 10:36:07 +0000815# unwrap('<URL:type://host/path>') --> 'type://host/path'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000816# splittype('type:opaquestring') --> 'type', 'opaquestring'
817# splithost('//host[:port]/path') --> 'host[:port]', '/path'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000818# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
819# splitpasswd('user:passwd') -> 'user', 'passwd'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000820# splitport('host:port') --> 'host', 'port'
821# splitquery('/path?query') --> '/path', 'query'
822# splittag('/path#tag') --> '/path', 'tag'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000823# splitattr('/path;attr1=value1;attr2=value2;...') ->
824# '/path', ['attr1=value1', 'attr2=value2', ...]
825# splitvalue('attr=value') --> 'attr', 'value'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000826# splitgophertype('/Xselector') --> 'X', 'selector'
827# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
829
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Surrounding whitespace, one pair of enclosing angle brackets and a
    leading 'URL:' prefix are each removed when present.
    """
    url = url.strip()
    if url.startswith('<') and url.endswith('>'):
        url = url[1:-1].strip()
    # The 'URL:' prefix is stripped with or without the brackets.
    if url.startswith('URL:'):
        url = url[4:].strip()
    return url
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000837
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    Returns (None, url) when url does not start with 'type:'.
    """
    global _typeprog
    if _typeprog is None:
        # Compiled on first use so importing the module stays cheap.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # found.end() is the position just past the ':' separator.
    return found.group(1), url[found.end():]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000851
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # Compiled on first use so importing the module stays cheap.
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000863
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split is made at the LAST '@': a host[:port] part never contains
    '@', whereas a password may.  Returns (None, host) when there is no
    '@' at all.
    """
    global _userprog
    if _userprog is None:
        # Compiled on first use so importing the module stays cheap.
        import re
        # Greedy first group => everything up to the final '@' is user info.
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match: return match.group(1, 2)
    return None, host
Guido van Rossum7c395db1994-07-04 22:14:49 +0000875
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split is made at the first ':'.  Returns (user, None) when there
    is no ':'.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compiled on first use so importing the module stays cheap.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000887
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string and is only recognised when it
    consists of one or more digits; otherwise (host, None) is returned
    with host unchanged.
    """
    global _portprog
    if _portprog is None:
        # Compiled on first use so importing the module stays cheap.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000900
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None as the port if there is a ':' but no valid number after it."""
    global _nportprog
    if _nportprog is None:
        # Compiled on first use so importing the module stays cheap.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port = found.group(1, 2)
    try:
        # int('') raises ValueError too, so an empty port also yields None.
        nport = int(port)
    except ValueError:
        nport = None
    return host, nport
Guido van Rossum53725a21996-06-13 19:12:35 +0000922
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split is made at the last '?'.  Returns (url, None) when there
    is no '?'.
    """
    global _queryprog
    if _queryprog is None:
        # Compiled on first use so importing the module stays cheap.
        import re
        # Raw string: '\?' is not a valid escape in an ordinary literal.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000934
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split is made at the last '#'.  Returns (url, None) when there
    is no '#'.
    """
    global _tagprog
    if _tagprog is None:
        # Compiled on first use so importing the module stays cheap.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000946
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    pieces = url.split(';')
    path = pieces[0]
    attrs = pieces[1:]
    return path, attrs
Guido van Rossum7c395db1994-07-04 22:14:49 +0000952
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split is made at the first '='.  Returns (attr, None) when there
    is no '='.
    """
    global _valueprog
    if _valueprog is None:
        # Compiled on first use so importing the module stays cheap.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000964
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    Returns (None, selector) unless selector starts with '/' followed by
    at least one more character.
    """
    if selector[:1] != '/' or not selector[1:2]:
        return None, selector
    gophertype = selector[1]
    return gophertype, selector[2:]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000970
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Every '%XX' sequence, where XX is exactly two hexadecimal digits, is
    replaced by the character with that code.  A '%' that is not
    followed by two hex digits is left in the result unchanged.
    """
    hexdigits = string.hexdigits
    pieces = s.split('%')
    # Everything before the first '%' is literal text.
    res = [pieces[0]]
    for item in pieces[1:]:
        code = item[:2]
        # Check the digits explicitly: int() would also accept a sign or
        # whitespace, which are not valid in an escape.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            try:
                res.append(chr(int(code, 16)) + item[2:])
            except ValueError:
                # The decoded character could not be joined to the rest
                # of the text; keep the escape as written.
                res.append('%' + item)
        else:
            res.append('%' + item)
    return ''.join(res)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000989
def unquote_plus(s):
    """unquote_plus('abc+def%2b') -> 'abc def+'.

    Like unquote(), but every '+' is first turned into a space.
    """
    return unquote(s.replace('+', ' '))
Guido van Rossum0564e121996-12-13 14:47:36 +0000995
# Spelled out rather than taken from string.letters, whose contents
# depend on the current locale: quoting must not vary with the locale.
always_safe = ('abcdefghijklmnopqrstuvwxyz'
               'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               '0123456789' '_,.-')
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'.

    Every character of s that is neither in always_safe nor in safe is
    replaced by '%xx', where xx is its code as two lowercase hex digits.
    By default '/' is left alone as well.
    """
    # XXX Can speed this up an order of magnitude
    safe = always_safe + safe
    res = []
    for c in s:
        if c in safe:
            res.append(c)
        else:
            res.append('%%%02x' % ord(c))
    return ''.join(res)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001007
def quote_plus(s, safe = '/'):
    """quote_plus('abc def') -> 'abc+def'.

    Like quote(), except that each space in s becomes '+' instead of
    being escaped.
    """
    # XXX Can speed this up an order of magnitude
    # With no space in s the split yields [s], so this is just quote(s, safe).
    return '+'.join([quote(chunk, safe) for chunk in s.split(' ')])
Guido van Rossum0564e121996-12-13 14:47:36 +00001018
def urlencode(dict):
    """Encode a dictionary of form entries into a URL query string.

    Keys and values are converted with str() and quoted with
    quote_plus(); the resulting 'key=value' pairs are joined with '&' in
    the order dict.items() yields them.
    """
    pairs = []
    for key, value in dict.items():
        pairs.append(quote_plus(str(key)) + '=' + quote_plus(str(value)))
    return '&'.join(pairs)
Guido van Rossum810a3391998-07-22 21:33:23 +00001027
Guido van Rossum442e7201996-03-20 15:33:11 +00001028
# Proxy handling
if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        An empty dictionary is returned when the ic module cannot be
        imported or ic.IC() raises ic.error.
        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}

        found = {}
        # HTTP:
        use_http = config.has_key('UseHTTPProxy') and config['UseHTTPProxy']
        if use_http:
            try:
                proxyhost = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                found['http'] = 'http://%s' % proxyhost
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return found

else:
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Scan the environment for variables named <scheme>_proxy;
        this seems to be the standard convention.  If you need a
        different way, you can pass a proxies dictionary to the
        [Fancy]URLopener constructor.

        Variable names are compared case-insensitively and variables
        with an empty value are skipped.
        """
        suffix = '_proxy'
        found = {}
        for name, value in os.environ.items():
            if not value:
                continue
            name = name.lower()
            if name.endswith(suffix):
                found[name[:-len(suffix)]] = value
        return found
Guido van Rossum442e7201996-03-20 15:33:11 +00001077
1078
# Test and time quote() and unquote()
def test1():
    """Round-trip a string of all 256 character codes through quote()
    and unquote(), print the three strings and the elapsed time.

    'Wrong!' is printed first when the round trip does not give back
    the original string.
    """
    import time
    s = ''.join(map(chr, range(256))) * 4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print('Wrong!')
    for text in (s, qs, uqs):
        print(repr(text))
    print('%s sec' % round(t1 - t0, 3))
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001095
1096
def reporthook(blocknum, blocksize, totalsize):
    """Report during remote transfers.

    Prints the three numbers it is given on one line.
    """
    progress = (blocknum, blocksize, totalsize)
    print("Block number: %d, Block size: %d, Total size: %d" % progress)
1100
# Test program
def test(args=[]):
    """Retrieve each URL in args and print its headers and contents.

    With no URLs given, a built-in list of sample URLs is used; an
    https URL is added to that list when URLopener has an open_https
    method.  urlcleanup() is always called at the end, even when a
    retrieval fails.
    """
    if not args:
        # A fresh list is bound here, so the shared default is never mutated.
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.python.org/etc/passwd',
##          'gopher://gopher.micro.umn.edu/1/',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    rule = '-'*10
    try:
        for url in args:
            print('%s %s %s' % (rule, url, rule))
            fn, h = urlretrieve(url, None, reporthook)
            print('%s %s' % (fn, h))
            if h:
                print('======')
                for k in h.keys():
                    print('%s %s' % (k + ':', h[k]))
                print('======')
            fp = open(fn, 'rb')
            data = fp.read()
            del fp
            # Drop carriage returns before printing.
            if '\r' in data:
                data = data.replace('\r', '')
            print(data)
            fn, h = None, None
        print('-'*40)
    finally:
        urlcleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001134
def main():
    """Command-line driver: python urllib.py [-t] [url ...].

    -h prints a usage message and returns.  With -t, test() is run on
    the given URLs, preceded by test1() when -t is given more than
    once.  Without -t, the contents of each URL are printed.
    """
    import getopt, sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print msg
        print "Use -h for help"
        return
    # t counts how many times -t was given.
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print "Usage: python urllib.py [-t] [url ...]"
            # Trailing comma: the next print continues on the same line.
            print "-t runs self-test;",
            print "otherwise, contents of urls are printed"
            return
    if t:
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print "Use -h for help"
        for url in args:
            # Trailing comma: no newline is added after the data.
            print urlopen(url).read(),
Guido van Rossum23490151998-06-25 02:39:00 +00001161
# Run the command-line driver when this file is executed as a script;
# nothing happens here when it is imported as a module.
if __name__ == '__main__':
    main()