blob: 7bc9f1789fb008c0191d33a0e2df05cb22c87192 [file] [log] [blame]
"""Open an arbitrary URL.

See the following document for more info on URLs:
"Names and Addresses, URIs, URLs, URNs, URCs", at
http://www.w3.org/pub/WWW/Addressing/Overview.html

See also the HTTP spec (from which the error codes are derived):
"HTTP - Hypertext Transfer Protocol", at
http://www.w3.org/pub/WWW/Protocols/

Related standards and specs:
- RFC1808: the "relative URL" spec. (authoritative status)
- RFC1738 - the "URL standard". (authoritative status)
- RFC1630 - the "URI spec". (informational status)

The object returned by URLopener().open(file) will differ per
protocol. All you know is that it has methods read(), readline(),
readlines(), fileno(), close() and info(). The read*(), fileno()
and close() methods work like those of open files.
The info() method returns a mimetools.Message object which can be
used to query various info about the object, if available.
(mimetools.Message objects are queried with the getheader() method.)
"""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000024
Guido van Rossum7c395db1994-07-04 22:14:49 +000025import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000026import socket
Jack Jansendc3e3f61995-12-15 13:22:13 +000027import os
Guido van Rossum3c8484e1996-11-20 22:02:24 +000028import sys
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000029
30
__version__ = '1.12' # XXX This version is not always updated :-(

MAXFTPCACHE = 10 # Trim the ftp cache beyond this size

# Helper for non-unix systems
# url2pathname() and pathname2url() are chosen by os.name: 'mac' and
# 'nt' import them from macurl2path / nturl2path, every other platform
# gets the thin wrappers below.  quote() and unquote() are looked up
# when the wrappers are called, so they only need to exist by then.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """Return unquote(pathname)."""
        return unquote(pathname)
    def pathname2url(pathname):
        """Return quote(pathname)."""
        return quote(pathname)
Guido van Rossum33add0a1998-12-18 15:25:22 +000045
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000046# This really consists of two pieces:
47# (1) a class which handles opening of all sorts of URLs
48# (plus assorted utilities etc.)
49# (2) a set of functions for parsing URLs
50# XXX Should these be separated out into different modules?
51
52
53# Shortcut for basic usage
_urlopener = None
def urlopen(url, data=None):
    """Open url through the shared module-level opener.

    The opener is a FancyURLopener created on first use and kept in
    the global _urlopener.  When data is not None it is handed to the
    opener's open() method as a second argument; otherwise open() is
    called with the URL alone.
    """
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    opener = _urlopener
    if data is not None:
        return opener.open(url, data)
    return opener.open(url)
def urlretrieve(url, filename=None, reporthook=None):
    """Call retrieve() on the shared module-level opener.

    The opener is a FancyURLopener created on first use and kept in
    the global _urlopener; all three arguments are passed through
    unchanged and retrieve()'s result is returned.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook)
def urlcleanup():
    """Run cleanup() on the shared opener, if one has been created."""
    opener = _urlopener
    if not opener:
        return
    opener.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000071
72
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000073ftpcache = {}
74class URLopener:
Guido van Rossume7b146f2000-02-04 15:28:42 +000075 """Class to open URLs.
76 This is a class rather than just a subroutine because we may need
77 more than one set of global protocol-specific options.
78 Note -- this is a base class for those who don't want the
79 automatic handling of errors type 302 (relocated) and 401
80 (authorization needed)."""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000081
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000082 __tempfiles = None
Guido van Rossum29e77811996-11-27 19:39:58 +000083
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000084 # Constructor
Guido van Rossum09c8b6c1999-12-07 21:37:17 +000085 def __init__(self, proxies=None, **x509):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000086 if proxies is None:
87 proxies = getproxies()
88 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
89 self.proxies = proxies
Guido van Rossum09c8b6c1999-12-07 21:37:17 +000090 self.key_file = x509.get('key_file')
91 self.cert_file = x509.get('cert_file')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000092 server_version = "Python-urllib/%s" % __version__
93 self.addheaders = [('User-agent', server_version)]
94 self.__tempfiles = []
95 self.__unlink = os.unlink # See cleanup()
96 self.tempcache = None
97 # Undocumented feature: if you assign {} to tempcache,
98 # it is used to cache files retrieved with
99 # self.retrieve(). This is not enabled by default
100 # since it does not work for changing documents (and I
101 # haven't got the logic to check expiration headers
102 # yet).
103 self.ftpcache = ftpcache
104 # Undocumented feature: you can use a different
105 # ftp cache by assigning to the .ftpcache member;
106 # in case you want logically independent URL openers
107 # XXX This is not threadsafe. Bah.
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000108
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000109 def __del__(self):
110 self.close()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000111
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000112 def close(self):
113 self.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000114
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000115 def cleanup(self):
116 # This code sometimes runs when the rest of this module
117 # has already been deleted, so it can't use any globals
118 # or import anything.
119 if self.__tempfiles:
120 for file in self.__tempfiles:
121 try:
122 self.__unlink(file)
123 except:
124 pass
125 del self.__tempfiles[:]
126 if self.tempcache:
127 self.tempcache.clear()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000128
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000129 def addheader(self, *args):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000130 """Add a header to be used by the HTTP interface only
131 e.g. u.addheader('Accept', 'sound/basic')"""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000132 self.addheaders.append(args)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000133
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000134 # External interface
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000135 def open(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000136 """Use URLopener().open(file) instead of open(file, 'r')."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000137 fullurl = unwrap(fullurl)
138 if self.tempcache and self.tempcache.has_key(fullurl):
139 filename, headers = self.tempcache[fullurl]
140 fp = open(filename, 'rb')
141 return addinfourl(fp, headers, fullurl)
142 type, url = splittype(fullurl)
143 if not type: type = 'file'
144 if self.proxies.has_key(type):
145 proxy = self.proxies[type]
146 type, proxy = splittype(proxy)
147 host, selector = splithost(proxy)
148 url = (host, fullurl) # Signal special case to open_*()
149 name = 'open_' + type
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000150 self.type = type
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000151 if '-' in name:
152 # replace - with _
153 name = string.join(string.split(name, '-'), '_')
154 if not hasattr(self, name):
155 if data is None:
156 return self.open_unknown(fullurl)
157 else:
158 return self.open_unknown(fullurl, data)
159 try:
160 if data is None:
161 return getattr(self, name)(url)
162 else:
163 return getattr(self, name)(url, data)
164 except socket.error, msg:
165 raise IOError, ('socket error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000166
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000167 def open_unknown(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000168 """Overridable interface to open unknown URL type."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000169 type, url = splittype(fullurl)
170 raise IOError, ('url error', 'unknown url type', type)
Guido van Rossumca445401995-08-29 19:19:12 +0000171
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000172 # External interface
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000173 def retrieve(self, url, filename=None, reporthook=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000174 """retrieve(url) returns (filename, None) for a local object
175 or (tempfilename, headers) for a remote object."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000176 url = unwrap(url)
177 if self.tempcache and self.tempcache.has_key(url):
178 return self.tempcache[url]
179 type, url1 = splittype(url)
180 if not filename and (not type or type == 'file'):
181 try:
182 fp = self.open_local_file(url1)
183 hdrs = fp.info()
184 del fp
185 return url2pathname(splithost(url1)[1]), hdrs
186 except IOError, msg:
187 pass
188 fp = self.open(url)
189 headers = fp.info()
190 if not filename:
191 import tempfile
192 garbage, path = splittype(url)
193 garbage, path = splithost(path or "")
194 path, garbage = splitquery(path or "")
195 path, garbage = splitattr(path or "")
196 suffix = os.path.splitext(path)[1]
197 filename = tempfile.mktemp(suffix)
198 self.__tempfiles.append(filename)
199 result = filename, headers
200 if self.tempcache is not None:
201 self.tempcache[url] = result
202 tfp = open(filename, 'wb')
203 bs = 1024*8
204 size = -1
205 blocknum = 1
206 if reporthook:
207 if headers.has_key("content-length"):
208 size = int(headers["Content-Length"])
209 reporthook(0, bs, size)
210 block = fp.read(bs)
211 if reporthook:
212 reporthook(1, bs, size)
213 while block:
214 tfp.write(block)
215 block = fp.read(bs)
216 blocknum = blocknum + 1
217 if reporthook:
218 reporthook(blocknum, bs, size)
219 fp.close()
220 tfp.close()
221 del fp
222 del tfp
223 return result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000224
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000225 # Each method named open_<type> knows how to open that type of URL
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000226
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000227 def open_http(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000228 """Use HTTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000229 import httplib
230 user_passwd = None
231 if type(url) is type(""):
232 host, selector = splithost(url)
233 if host:
234 user_passwd, host = splituser(host)
235 host = unquote(host)
236 realhost = host
237 else:
238 host, selector = url
239 urltype, rest = splittype(selector)
240 url = rest
241 user_passwd = None
242 if string.lower(urltype) != 'http':
243 realhost = None
244 else:
245 realhost, rest = splithost(rest)
246 if realhost:
247 user_passwd, realhost = splituser(realhost)
248 if user_passwd:
249 selector = "%s://%s%s" % (urltype, realhost, rest)
250 #print "proxy via http:", host, selector
251 if not host: raise IOError, ('http error', 'no host given')
252 if user_passwd:
253 import base64
254 auth = string.strip(base64.encodestring(user_passwd))
255 else:
256 auth = None
257 h = httplib.HTTP(host)
258 if data is not None:
259 h.putrequest('POST', selector)
260 h.putheader('Content-type', 'application/x-www-form-urlencoded')
261 h.putheader('Content-length', '%d' % len(data))
262 else:
263 h.putrequest('GET', selector)
264 if auth: h.putheader('Authorization', 'Basic %s' % auth)
265 if realhost: h.putheader('Host', realhost)
266 for args in self.addheaders: apply(h.putheader, args)
267 h.endheaders()
268 if data is not None:
269 h.send(data + '\r\n')
270 errcode, errmsg, headers = h.getreply()
271 fp = h.getfile()
272 if errcode == 200:
273 return addinfourl(fp, headers, "http:" + url)
274 else:
275 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000276 return self.http_error(url, fp, errcode, errmsg, headers)
Guido van Rossum29aab751999-03-09 19:31:21 +0000277 else:
278 return self.http_error(url, fp, errcode, errmsg, headers, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000279
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000280 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000281 """Handle http errors.
282 Derived class can override this, or provide specific handlers
283 named http_error_DDD where DDD is the 3-digit error code."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000284 # First check if there's a specific handler for this error
285 name = 'http_error_%d' % errcode
286 if hasattr(self, name):
287 method = getattr(self, name)
288 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000289 result = method(url, fp, errcode, errmsg, headers)
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000290 else:
291 result = method(url, fp, errcode, errmsg, headers, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000292 if result: return result
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000293 return self.http_error_default(url, fp, errcode, errmsg, headers)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000294
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000295 def http_error_default(self, url, fp, errcode, errmsg, headers):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000296 """Default error handler: close the connection and raise IOError."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000297 void = fp.read()
298 fp.close()
299 raise IOError, ('http error', errcode, errmsg, headers)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000300
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000301 if hasattr(socket, "ssl"):
302 def open_https(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000303 """Use HTTPS protocol."""
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000304 import httplib
305 if type(url) is type(""):
306 host, selector = splithost(url)
307 user_passwd, host = splituser(host)
308 else:
309 host, selector = url
310 urltype, rest = splittype(selector)
311 if string.lower(urltype) == 'https':
312 realhost, rest = splithost(rest)
313 user_passwd, realhost = splituser(realhost)
314 if user_passwd:
315 selector = "%s://%s%s" % (urltype, realhost, rest)
316 print "proxy via https:", host, selector
317 if not host: raise IOError, ('https error', 'no host given')
318 if user_passwd:
319 import base64
320 auth = string.strip(base64.encodestring(user_passwd))
321 else:
322 auth = None
323 h = httplib.HTTPS(host, 0,
324 key_file=self.key_file,
325 cert_file=self.cert_file)
326 h.putrequest('GET', selector)
327 if auth: h.putheader('Authorization: Basic %s' % auth)
328 for args in self.addheaders: apply(h.putheader, args)
329 h.endheaders()
330 errcode, errmsg, headers = h.getreply()
331 fp = h.getfile()
332 if errcode == 200:
333 return addinfourl(fp, headers, url)
334 else:
335 return self.http_error(url, fp, errcode, errmsg, headers)
336
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000337 def open_gopher(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000338 """Use Gopher protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000339 import gopherlib
340 host, selector = splithost(url)
341 if not host: raise IOError, ('gopher error', 'no host given')
342 host = unquote(host)
343 type, selector = splitgophertype(selector)
344 selector, query = splitquery(selector)
345 selector = unquote(selector)
346 if query:
347 query = unquote(query)
348 fp = gopherlib.send_query(selector, query, host)
349 else:
350 fp = gopherlib.send_selector(selector, host)
351 return addinfourl(fp, noheaders(), "gopher:" + url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000352
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000353 def open_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000354 """Use local file or FTP depending on form of URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000355 if url[:2] == '//' and url[2:3] != '/':
356 return self.open_ftp(url)
357 else:
358 return self.open_local_file(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000359
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000360 def open_local_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000361 """Use local file."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000362 import mimetypes, mimetools, StringIO
363 mtype = mimetypes.guess_type(url)[0]
364 headers = mimetools.Message(StringIO.StringIO(
365 'Content-Type: %s\n' % (mtype or 'text/plain')))
366 host, file = splithost(url)
367 if not host:
Guido van Rossum336a2011999-06-24 15:27:36 +0000368 urlfile = file
369 if file[:1] == '/':
370 urlfile = 'file://' + file
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000371 return addinfourl(open(url2pathname(file), 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000372 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000373 host, port = splitport(host)
374 if not port \
375 and socket.gethostbyname(host) in (localhost(), thishost()):
Guido van Rossum336a2011999-06-24 15:27:36 +0000376 urlfile = file
377 if file[:1] == '/':
378 urlfile = 'file://' + file
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000379 return addinfourl(open(url2pathname(file), 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000380 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000381 raise IOError, ('local file error', 'not on local host')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000382
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000383 def open_ftp(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000384 """Use FTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000385 host, path = splithost(url)
386 if not host: raise IOError, ('ftp error', 'no host given')
387 host, port = splitport(host)
388 user, host = splituser(host)
389 if user: user, passwd = splitpasswd(user)
390 else: passwd = None
391 host = unquote(host)
392 user = unquote(user or '')
393 passwd = unquote(passwd or '')
394 host = socket.gethostbyname(host)
395 if not port:
396 import ftplib
397 port = ftplib.FTP_PORT
398 else:
399 port = int(port)
400 path, attrs = splitattr(path)
401 path = unquote(path)
402 dirs = string.splitfields(path, '/')
403 dirs, file = dirs[:-1], dirs[-1]
404 if dirs and not dirs[0]: dirs = dirs[1:]
Guido van Rossum5e006a31999-08-18 17:40:33 +0000405 if dirs and not dirs[0]: dirs[0] = '/'
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000406 key = (user, host, port, string.joinfields(dirs, '/'))
407 # XXX thread unsafe!
408 if len(self.ftpcache) > MAXFTPCACHE:
409 # Prune the cache, rather arbitrarily
410 for k in self.ftpcache.keys():
411 if k != key:
412 v = self.ftpcache[k]
413 del self.ftpcache[k]
414 v.close()
415 try:
416 if not self.ftpcache.has_key(key):
417 self.ftpcache[key] = \
418 ftpwrapper(user, passwd, host, port, dirs)
419 if not file: type = 'D'
420 else: type = 'I'
421 for attr in attrs:
422 attr, value = splitvalue(attr)
423 if string.lower(attr) == 'type' and \
424 value in ('a', 'A', 'i', 'I', 'd', 'D'):
425 type = string.upper(value)
426 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
427 if retrlen is not None and retrlen >= 0:
428 import mimetools, StringIO
429 headers = mimetools.Message(StringIO.StringIO(
430 'Content-Length: %d\n' % retrlen))
431 else:
432 headers = noheaders()
433 return addinfourl(fp, headers, "ftp:" + url)
434 except ftperrors(), msg:
435 raise IOError, ('ftp error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000436
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000437 def open_data(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000438 """Use "data" URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000439 # ignore POSTed data
440 #
441 # syntax of data URLs:
442 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
443 # mediatype := [ type "/" subtype ] *( ";" parameter )
444 # data := *urlchar
445 # parameter := attribute "=" value
446 import StringIO, mimetools, time
447 try:
448 [type, data] = string.split(url, ',', 1)
449 except ValueError:
450 raise IOError, ('data error', 'bad data URL')
451 if not type:
452 type = 'text/plain;charset=US-ASCII'
453 semi = string.rfind(type, ';')
454 if semi >= 0 and '=' not in type[semi:]:
455 encoding = type[semi+1:]
456 type = type[:semi]
457 else:
458 encoding = ''
459 msg = []
460 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
461 time.gmtime(time.time())))
462 msg.append('Content-type: %s' % type)
463 if encoding == 'base64':
464 import base64
465 data = base64.decodestring(data)
466 else:
467 data = unquote(data)
468 msg.append('Content-length: %d' % len(data))
469 msg.append('')
470 msg.append(data)
471 msg = string.join(msg, '\n')
472 f = StringIO.StringIO(msg)
473 headers = mimetools.Message(f, 0)
474 f.fileno = None # needed for addinfourl
475 return addinfourl(f, headers, url)
Guido van Rossum6d4d1c21998-03-12 14:32:55 +0000476
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000477
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as URLopener.__init__().

        Keyword arguments are forwarded too, so key_file= and
        cert_file= reach the base class.
        """
        apply(URLopener.__init__, (self,) + args, kwargs)
        # Maps 'realm@host' to a (user, passwd) pair; see
        # get_user_passwd().
        self.auth_cache = {}

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        Follows the 'location' (or, failing that, 'uri') header.
        Returns None when neither header is present, which makes
        http_error() fall back to http_error_default().
        """
        # XXX The server can force infinite recursion here!
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin("http:" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
        # Only the 'basic' scheme is handled.  Any other outcome
        # returns None implicitly, which makes http_error() fall back
        # to http_error_default().
        if headers.has_key('www-authenticate'):
            stuff = headers['www-authenticate']
            import re
            match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
            if match:
                scheme, realm = match.groups()
                if string.lower(scheme) == 'basic':
                    # self.type was recorded by open(); it selects
                    # retry_http_basic_auth or retry_https_basic_auth.
                    name = 'retry_' + self.type + '_basic_auth'
                    if data is None:
                        return getattr(self,name)(url, realm)
                    else:
                        return getattr(self,name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Reopen url over http with user:passwd@ put in front of the
        host; return None when no credentials were obtained."""
        host, selector = splithost(url)
        # i is non-zero when the URL already carried credentials; they
        # have just been rejected, so it is passed as the clear_cache
        # flag to drop the cached pair.
        i = string.find(host, '@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = user + ':' + passwd + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Reopen url over https with user:passwd@ put in front of the
        host; return None when no credentials were obtained.

        data is accepted for symmetry with retry_http_basic_auth() but
        is not used.
        """
        host, selector = splithost(url)
        # See retry_http_basic_auth() for the meaning of i.
        i = string.find(host, '@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = user + ':' + passwd + '@' + host
        newurl = '//' + host + selector
        return self.open_https(newurl)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return (user, passwd) for realm at host.

        A cached pair is returned unless clear_cache is true, in which
        case it is dropped and prompt_user_passwd() is asked again.  A
        non-empty answer is cached.
        """
        key = realm + '@' + string.lower(host)
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000574
575
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000576# Utility functions
577
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup is done once; the answer is kept in the module global
    _localhost and reused by later calls.
    """
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000585
_thishost = None
def thishost():
    """Return the IP address of the current host.

    The lookup is done once; the answer is kept in the module global
    _thishost and reused by later calls.
    """
    global _thishost
    if _thishost:
        return _thishost
    hostname = socket.gethostname()
    _thishost = socket.gethostbyname(hostname)
    return _thishost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000593
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000594_ftperrors = None
595def ftperrors():
Guido van Rossume7b146f2000-02-04 15:28:42 +0000596 """Return the set of errors raised by the FTP class."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000597 global _ftperrors
598 if not _ftperrors:
599 import ftplib
600 _ftperrors = ftplib.all_errors
601 return _ftperrors
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000602
_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object.

    The object is created on the first call and the same instance is
    returned afterwards.
    """
    global _noheaders
    # Compare with None explicitly: the cached object is an *empty*
    # message, and if Message reports its size through __len__ an
    # empty one is false, so a plain truth test would rebuild it on
    # every call.
    if _noheaders is None:
        import mimetools
        import StringIO
        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
        _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000613
614
615# Utility classes
616
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000617class ftpwrapper:
Guido van Rossume7b146f2000-02-04 15:28:42 +0000618 """Class used by open_ftp() for cache of open FTP connections."""
619
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000620 def __init__(self, user, passwd, host, port, dirs):
621 self.user = user
622 self.passwd = passwd
623 self.host = host
624 self.port = port
625 self.dirs = dirs
626 self.init()
Guido van Rossume7b146f2000-02-04 15:28:42 +0000627
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000628 def init(self):
629 import ftplib
630 self.busy = 0
631 self.ftp = ftplib.FTP()
632 self.ftp.connect(self.host, self.port)
633 self.ftp.login(self.user, self.passwd)
634 for dir in self.dirs:
635 self.ftp.cwd(dir)
Guido van Rossume7b146f2000-02-04 15:28:42 +0000636
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000637 def retrfile(self, file, type):
638 import ftplib
639 self.endtransfer()
640 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
641 else: cmd = 'TYPE ' + type; isdir = 0
642 try:
643 self.ftp.voidcmd(cmd)
644 except ftplib.all_errors:
645 self.init()
646 self.ftp.voidcmd(cmd)
647 conn = None
648 if file and not isdir:
649 # Use nlst to see if the file exists at all
650 try:
651 self.ftp.nlst(file)
652 except ftplib.error_perm, reason:
653 raise IOError, ('ftp error', reason), sys.exc_info()[2]
654 # Restore the transfer mode!
655 self.ftp.voidcmd(cmd)
656 # Try to retrieve as a file
657 try:
658 cmd = 'RETR ' + file
659 conn = self.ftp.ntransfercmd(cmd)
660 except ftplib.error_perm, reason:
661 if reason[:3] != '550':
662 raise IOError, ('ftp error', reason), sys.exc_info()[2]
663 if not conn:
664 # Set transfer mode to ASCII!
665 self.ftp.voidcmd('TYPE A')
666 # Try a directory listing
667 if file: cmd = 'LIST ' + file
668 else: cmd = 'LIST'
669 conn = self.ftp.ntransfercmd(cmd)
670 self.busy = 1
671 # Pass back both a suitably decorated object and a retrieval length
672 return (addclosehook(conn[0].makefile('rb'),
673 self.endtransfer), conn[1])
674 def endtransfer(self):
675 if not self.busy:
676 return
677 self.busy = 0
678 try:
679 self.ftp.voidresp()
680 except ftperrors():
681 pass
Guido van Rossume7b146f2000-02-04 15:28:42 +0000682
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000683 def close(self):
684 self.endtransfer()
685 try:
686 self.ftp.close()
687 except ftperrors():
688 pass
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000689
class addbase:
    """Base class for addinfo and addclosehook."""

    def __init__(self, fp):
        """Wrap fp, exposing its read methods as instance attributes.

        read and readline are required on fp; readlines and fileno
        are copied only when fp has them.
        """
        self.fp = fp
        self.read = fp.read
        self.readline = fp.readline
        for optional in ("readlines", "fileno"):
            if hasattr(fp, optional):
                setattr(self, optional, getattr(fp, optional))

    def __repr__(self):
        description = (self.__class__.__name__,
                       repr(id(self)), repr(self.fp))
        return '<%s at %s whose fp = %s>' % description

    def close(self):
        """Drop the forwarded methods, then close and forget fp."""
        self.read = self.readline = self.readlines = self.fileno = None
        wrapped = self.fp
        if wrapped: wrapped.close()
        self.fp = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000711
class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        """Wrap fp; closehook(*hookargs) is called by close()."""
        addbase.__init__(self, fp)
        self.closehook, self.hookargs = closehook, hookargs

    def close(self):
        """Run the hook (at most once), then close the wrapped file."""
        hook = self.closehook
        if hook:
            apply(hook, self.hookargs)
            # Forget the hook so that a second close() cannot run it
            # again.
            self.closehook = self.hookargs = None
        addbase.close(self)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000726
class addinfo(addbase):
    """Class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        """Wrap fp and keep headers for later retrieval through info()."""
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000736
class addinfourl(addbase):
    """Class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        """Wrap fp and keep headers and url for info() and geturl()."""
        addbase.__init__(self, fp)
        self.headers = headers
        self.url = url

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers

    def geturl(self):
        """Return the url given to the constructor."""
        return self.url
Guido van Rossume6ad8911996-09-10 17:02:56 +0000750
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000751
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL."""
    scheme, path = splittype(url)
    if scheme:
        # url is complete (it carries its own type): return it untouched
        return url
    host, path = splithost(path)
    scheme, basepath = splittype(base)          # inherit type from base
    if host:
        # url names a host, so only the type can be inherited
        if scheme:
            return scheme + '://' + host + path
        # no type inherited, so url must have started with //
        return url
    host, basepath = splithost(basepath)        # inherit host
    basepath, basetag = splittag(basepath)      # remove extraneous cruft
    basepath, basequery = splitquery(basepath)  # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query: attach it to all of basepath
            cut = len(basepath)
        else:
            # otherwise the last component of basepath gets replaced
            cut = basepath.rfind('/')
        if cut >= 0:
            # drop the last file component
            basepath = basepath[:cut + 1]
        elif host:
            # basepath not absolute but a host is present: make it absolute
            basepath = '/'
        else:
            # no host either: keep it non-absolute
            basepath = ''
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            cut = basepath[:-1].rfind('/')
            if cut > 0:
                basepath = basepath[:cut + 1]
            elif cut == 0:
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    if scheme and host:
        return scheme + '://' + host + path
    if scheme:
        return scheme + ':' + path
    if host:
        return '//' + host + path   # don't know what this means
    return path
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000806
807
Guido van Rossum7c395db1994-07-04 22:14:49 +0000808# Utilities to parse URLs (most of these return None for missing parts):
Sjoerd Mullendere0371b81995-11-10 10:36:07 +0000809# unwrap('<URL:type://host/path>') --> 'type://host/path'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000810# splittype('type:opaquestring') --> 'type', 'opaquestring'
811# splithost('//host[:port]/path') --> 'host[:port]', '/path'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000812# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
813# splitpasswd('user:passwd') -> 'user', 'passwd'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000814# splitport('host:port') --> 'host', 'port'
815# splitquery('/path?query') --> '/path', 'query'
816# splittag('/path#tag') --> '/path', 'tag'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000817# splitattr('/path;attr1=value1;attr2=value2;...') ->
818# '/path', ['attr1=value1', 'attr2=value2', ...]
819# splitvalue('attr=value') --> 'attr', 'value'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000820# splitgophertype('/Xselector') --> 'X', 'selector'
821# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
823
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Surrounding whitespace, one pair of enclosing angle brackets and a
    leading 'URL:' marker are each removed when present.
    """
    url = url.strip()
    if url.startswith('<') and url.endswith('>'):
        url = url[1:-1].strip()
    if url.startswith('URL:'):
        url = url[4:].strip()
    return url
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000831
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    Returns (None, url) when the pattern does not match.
    """
    global _typeprog
    if _typeprog is None:
        # compiled on first use and cached in the module-level global
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # found.end() is the position just past the ':' separator
    return found.group(1), url[found.end():]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000845
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when the pattern does not match.
    """
    global _hostprog
    if _hostprog is None:
        # compiled on first use and cached in the module-level global
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000857
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    Returns (None, host) when the pattern does not match.
    """
    global _userprog
    if _userprog is None:
        # compiled on first use and cached in the module-level global
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    found = _userprog.match(host)
    if found is None:
        return None, host
    return found.groups()
Guido van Rossum7c395db1994-07-04 22:14:49 +0000869
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    Returns (user, None) when the pattern does not match.
    """
    global _passwdprog
    if _passwdprog is None:
        # compiled on first use and cached in the module-level global
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.groups()
Guido van Rossum7c395db1994-07-04 22:14:49 +0000881
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  Returns (host, None)
    when the pattern does not match, e.g. when the text after the last
    ':' is empty or not all digits.
    """
    global _portprog
    if _portprog is None:
        # compiled on first use and cached in the module-level global
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000894
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        # compiled on first use and cached in the module-level global
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port = found.groups()
    # an empty or non-numeric port is reported as None
    nport = None
    if port:
        try:
            nport = int(port)
        except ValueError:
            pass
    return host, nport
Guido van Rossum53725a21996-06-13 19:12:35 +0000916
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Returns (url, None) when the
    pattern does not match.
    """
    global _queryprog
    if _queryprog is None:
        # compiled on first use and cached in the module-level global
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    found = _queryprog.match(url)
    if found is None:
        return url, None
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000928
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Returns (url, None) when the
    pattern does not match.
    """
    global _tagprog
    if _tagprog is None:
        # compiled on first use and cached in the module-level global
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000940
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    pieces = url.split(';')
    # the first piece is the path, everything after it is an attribute
    path = pieces[0]
    attrs = pieces[1:]
    return path, attrs
Guido van Rossum7c395db1994-07-04 22:14:49 +0000946
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Returns (attr, None) when the
    pattern does not match.
    """
    global _valueprog
    if _valueprog is None:
        # compiled on first use and cached in the module-level global
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.groups()
Guido van Rossum7c395db1994-07-04 22:14:49 +0000958
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    Returns (None, selector) unless selector starts with '/' followed by
    at least one more character.
    """
    has_type = selector[:1] == '/' and len(selector) > 1
    if not has_type:
        return None, selector
    return selector[1], selector[2:]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000964
_hexdigits = '0123456789ABCDEFabcdef'
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Every '%' followed by exactly two hexadecimal digits is replaced by
    the character with that code.  Any other '%' (too few characters
    after it, or characters that are not hex digits) is left unchanged.
    """
    pieces = s.split('%')
    res = [pieces[0]]
    for item in pieces[1:]:
        code = item[:2]
        # Check the digits explicitly: a base-16 integer conversion also
        # accepts a sign or surrounding blanks, which would wrongly
        # decode input such as '%+f' or '% 1'.
        if (len(code) == 2
                and code[0] in _hexdigits and code[1] in _hexdigits):
            res.append(chr(int(code, 16)) + item[2:])
        else:
            res.append('%' + item)
    return ''.join(res)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000983
def unquote_plus(s):
    """unquote_plus('abc+def%21') -> 'abc def!'.

    Like unquote(), but every '+' is first turned into a space.
    """
    return unquote(s.replace('+', ' '))
Guido van Rossum0564e121996-12-13 14:47:36 +0000989
# Spelled out rather than taken from string.letters, whose value depends
# on the current locale; the set of characters that never need quoting
# must not vary with it.
always_safe = ('abcdefghijklmnopqrstuvwxyz'
               'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               '0123456789' '_,.-')
def quote(s, safe='/'):
    """quote('abc def') -> 'abc%20def'.

    Each character of s that is neither in always_safe nor in safe is
    replaced by '%' followed by its two-digit lowercase hex code.
    safe lists additional characters to leave alone (default '/').
    """
    # XXX Can speed this up an order of magnitude
    safe = always_safe + safe
    res = []
    for c in s:
        if c in safe:
            res.append(c)
        else:
            res.append('%%%02x' % ord(c))
    return ''.join(res)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001001
def quote_plus(s, safe='/'):
    """Like quote(), but additionally turn every space into a '+'.

    The pieces between spaces are quoted separately with quote(s, safe)
    and then glued back together with '+'.
    """
    # XXX Can speed this up an order of magnitude
    if ' ' not in s:
        return quote(s, safe)
    quoted = []
    for piece in s.split(' '):
        quoted.append(quote(piece, safe))
    return '+'.join(quoted)
Guido van Rossum0564e121996-12-13 14:47:36 +00001012
def urlencode(dict):
    """Encode a dictionary of form entries into a URL query string.

    Keys and values are converted with str(), quoted with quote_plus()
    and emitted as 'key=value' pairs joined by '&', in the order that
    dict.items() yields them.
    """
    pairs = []
    for key, value in dict.items():
        pairs.append(quote_plus(str(key)) + '=' + quote_plus(str(value)))
    return '&'.join(pairs)
Guido van Rossum810a3391998-07-22 21:33:23 +00001021
Guido van Rossum442e7201996-03-20 15:33:11 +00001022
1023# Proxy handling
if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store proxies.
        When the UseHTTPProxy setting is present and true, the value
        of HTTPProxyHost is used for the 'http' entry.  An empty
        dictionary is returned when the ic module cannot be imported
        or ic.IC() raises ic.error.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        use_http = config.has_key('UseHTTPProxy') and config['UseHTTPProxy']
        if use_http:
            try:
                proxy_host = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % proxy_host
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

else:
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Scan the environment for variables named <scheme>_proxy
        (matched case-insensitively); this seems to be the standard
        convention.  Variables with an empty value are skipped.  If
        you need a different way, you can pass a proxies dictionary
        to the [Fancy]URLopener constructor.

        """
        proxies = {}
        for variable, value in os.environ.items():
            if not value:
                continue
            variable = variable.lower()
            if variable[-6:] == '_proxy':
                # strip the '_proxy' suffix to obtain the scheme name
                proxies[variable[:-6]] = value
        return proxies
Guido van Rossum442e7201996-03-20 15:33:11 +00001071
1072
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001073# Test and time quote() and unquote()
1074def test1():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001075 import time
1076 s = ''
1077 for i in range(256): s = s + chr(i)
1078 s = s*4
1079 t0 = time.time()
1080 qs = quote(s)
1081 uqs = unquote(qs)
1082 t1 = time.time()
1083 if uqs != s:
1084 print 'Wrong!'
1085 print `s`
1086 print `qs`
1087 print `uqs`
1088 print round(t1 - t0, 3), 'sec'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001089
1090
Guido van Rossum9ab96d41998-09-28 14:07:00 +00001091def reporthook(blocknum, blocksize, totalsize):
1092 # Report during remote transfers
1093 print "Block number: %d, Block size: %d, Total size: %d" % (blocknum, blocksize, totalsize)
1094
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001095# Test program
Guido van Rossum23490151998-06-25 02:39:00 +00001096def test(args=[]):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001097 if not args:
1098 args = [
1099 '/etc/passwd',
1100 'file:/etc/passwd',
1101 'file://localhost/etc/passwd',
1102 'ftp://ftp.python.org/etc/passwd',
1103## 'gopher://gopher.micro.umn.edu/1/',
1104 'http://www.python.org/index.html',
1105 ]
Guido van Rossum09c8b6c1999-12-07 21:37:17 +00001106 if hasattr(URLopener, "open_https"):
1107 args.append('https://synergy.as.cmu.edu/~geek/')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001108 try:
1109 for url in args:
1110 print '-'*10, url, '-'*10
1111 fn, h = urlretrieve(url, None, reporthook)
1112 print fn, h
1113 if h:
1114 print '======'
1115 for k in h.keys(): print k + ':', h[k]
1116 print '======'
1117 fp = open(fn, 'rb')
1118 data = fp.read()
1119 del fp
1120 if '\r' in data:
1121 table = string.maketrans("", "")
1122 data = string.translate(data, table, "\r")
1123 print data
1124 fn, h = None, None
1125 print '-'*40
1126 finally:
1127 urlcleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001128
Guido van Rossum23490151998-06-25 02:39:00 +00001129def main():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001130 import getopt, sys
1131 try:
1132 opts, args = getopt.getopt(sys.argv[1:], "th")
1133 except getopt.error, msg:
1134 print msg
1135 print "Use -h for help"
1136 return
1137 t = 0
1138 for o, a in opts:
1139 if o == '-t':
1140 t = t + 1
1141 if o == '-h':
1142 print "Usage: python urllib.py [-t] [url ...]"
1143 print "-t runs self-test;",
1144 print "otherwise, contents of urls are printed"
1145 return
1146 if t:
1147 if t > 1:
1148 test1()
1149 test(args)
1150 else:
1151 if not args:
1152 print "Use -h for help"
1153 for url in args:
1154 print urlopen(url).read(),
Guido van Rossum23490151998-06-25 02:39:00 +00001155
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001156# Run test program when run as a script
1157if __name__ == '__main__':
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001158 main()