blob: 9400757db6a796e7e3753b502d93a3df22f6db90 [file] [log] [blame]
"""Open an arbitrary URL.

See the following document for more info on URLs:
"Names and Addresses, URIs, URLs, URNs, URCs", at
http://www.w3.org/pub/WWW/Addressing/Overview.html

See also the HTTP spec (from which the error codes are derived):
"HTTP - Hypertext Transfer Protocol", at
http://www.w3.org/pub/WWW/Protocols/

Related standards and specs:
- RFC1808: the "relative URL" spec. (authoritative status)
- RFC1738 - the "URL standard". (authoritative status)
- RFC1630 - the "URI spec". (informational status)

The object returned by URLopener().open(file) will differ per
protocol.  All you know is that it has methods read(), readline(),
readlines(), fileno(), close() and info().  The read*(), fileno()
and close() methods work like those of open files.
The info() method returns a mimetools.Message object which can be
used to query various info about the object, if available.
(mimetools.Message objects are queried with the getheader() method.)
"""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000024
Guido van Rossum7c395db1994-07-04 22:14:49 +000025import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000026import socket
Jack Jansendc3e3f61995-12-15 13:22:13 +000027import os
Guido van Rossum3c8484e1996-11-20 22:02:24 +000028import sys
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000029
30
Guido van Rossum09c8b6c1999-12-07 21:37:17 +000031__version__ = '1.12' # XXX This version is not always updated :-(
Guido van Rossumf668d171997-06-06 21:11:11 +000032
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000033MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
Guido van Rossum6cb15a01995-06-22 19:00:13 +000034
# Pick the URL <-> filesystem path converters for this platform.
# Windows and classic MacOS ship dedicated modules; everywhere else the
# conversion is plain percent-(un)quoting.
if os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """Turn the path part of a URL into a local path by unquoting it."""
        return unquote(pathname)

    def pathname2url(pathname):
        """Turn a local path into the path part of a URL by quoting it."""
        return quote(pathname)
Guido van Rossum33add0a1998-12-18 15:25:22 +000045
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000046# This really consists of two pieces:
47# (1) a class which handles opening of all sorts of URLs
48# (plus assorted utilities etc.)
49# (2) a set of functions for parsing URLs
50# XXX Should these be separated out into different modules?
51
52
# Shortcut for basic usage: one module-wide opener, created on first use.
_urlopener = None


def urlopen(url, data=None):
    """Open url with the shared FancyURLopener and return its result.

    The shared opener is created the first time it is needed.  data is
    only forwarded to the opener's open() when it is not None.
    """
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    if data is not None:
        return _urlopener.open(url, data)
    return _urlopener.open(url)
def urlretrieve(url, filename=None, reporthook=None):
    """Fetch url through the shared opener's retrieve() method.

    The shared FancyURLopener is created on first use; all three
    arguments are passed through unchanged.
    """
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    opener = _urlopener
    return opener.retrieve(url, filename, reporthook)
def urlcleanup():
    """Run cleanup() on the shared opener, if one has been created."""
    opener = _urlopener
    if not opener:
        return
    opener.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000071
72
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000073ftpcache = {}
74class URLopener:
Guido van Rossume7b146f2000-02-04 15:28:42 +000075 """Class to open URLs.
76 This is a class rather than just a subroutine because we may need
77 more than one set of global protocol-specific options.
78 Note -- this is a base class for those who don't want the
79 automatic handling of errors type 302 (relocated) and 401
80 (authorization needed)."""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000081
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000082 __tempfiles = None
Guido van Rossum29e77811996-11-27 19:39:58 +000083
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000084 # Constructor
Guido van Rossum09c8b6c1999-12-07 21:37:17 +000085 def __init__(self, proxies=None, **x509):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000086 if proxies is None:
87 proxies = getproxies()
88 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
89 self.proxies = proxies
Guido van Rossum09c8b6c1999-12-07 21:37:17 +000090 self.key_file = x509.get('key_file')
91 self.cert_file = x509.get('cert_file')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000092 server_version = "Python-urllib/%s" % __version__
93 self.addheaders = [('User-agent', server_version)]
94 self.__tempfiles = []
95 self.__unlink = os.unlink # See cleanup()
96 self.tempcache = None
97 # Undocumented feature: if you assign {} to tempcache,
98 # it is used to cache files retrieved with
99 # self.retrieve(). This is not enabled by default
100 # since it does not work for changing documents (and I
101 # haven't got the logic to check expiration headers
102 # yet).
103 self.ftpcache = ftpcache
104 # Undocumented feature: you can use a different
105 # ftp cache by assigning to the .ftpcache member;
106 # in case you want logically independent URL openers
107 # XXX This is not threadsafe. Bah.
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000108
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000109 def __del__(self):
110 self.close()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000111
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000112 def close(self):
113 self.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000114
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000115 def cleanup(self):
116 # This code sometimes runs when the rest of this module
117 # has already been deleted, so it can't use any globals
118 # or import anything.
119 if self.__tempfiles:
120 for file in self.__tempfiles:
121 try:
122 self.__unlink(file)
123 except:
124 pass
125 del self.__tempfiles[:]
126 if self.tempcache:
127 self.tempcache.clear()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000128
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000129 def addheader(self, *args):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000130 """Add a header to be used by the HTTP interface only
131 e.g. u.addheader('Accept', 'sound/basic')"""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000132 self.addheaders.append(args)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000133
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000134 # External interface
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000135 def open(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000136 """Use URLopener().open(file) instead of open(file, 'r')."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000137 fullurl = unwrap(fullurl)
138 if self.tempcache and self.tempcache.has_key(fullurl):
139 filename, headers = self.tempcache[fullurl]
140 fp = open(filename, 'rb')
141 return addinfourl(fp, headers, fullurl)
142 type, url = splittype(fullurl)
143 if not type: type = 'file'
144 if self.proxies.has_key(type):
145 proxy = self.proxies[type]
146 type, proxy = splittype(proxy)
147 host, selector = splithost(proxy)
148 url = (host, fullurl) # Signal special case to open_*()
149 name = 'open_' + type
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000150 self.type = type
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000151 if '-' in name:
152 # replace - with _
153 name = string.join(string.split(name, '-'), '_')
154 if not hasattr(self, name):
155 if data is None:
156 return self.open_unknown(fullurl)
157 else:
158 return self.open_unknown(fullurl, data)
159 try:
160 if data is None:
161 return getattr(self, name)(url)
162 else:
163 return getattr(self, name)(url, data)
164 except socket.error, msg:
165 raise IOError, ('socket error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000166
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000167 def open_unknown(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000168 """Overridable interface to open unknown URL type."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000169 type, url = splittype(fullurl)
170 raise IOError, ('url error', 'unknown url type', type)
Guido van Rossumca445401995-08-29 19:19:12 +0000171
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000172 # External interface
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000173 def retrieve(self, url, filename=None, reporthook=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000174 """retrieve(url) returns (filename, None) for a local object
175 or (tempfilename, headers) for a remote object."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000176 url = unwrap(url)
177 if self.tempcache and self.tempcache.has_key(url):
178 return self.tempcache[url]
179 type, url1 = splittype(url)
180 if not filename and (not type or type == 'file'):
181 try:
182 fp = self.open_local_file(url1)
183 hdrs = fp.info()
184 del fp
185 return url2pathname(splithost(url1)[1]), hdrs
186 except IOError, msg:
187 pass
188 fp = self.open(url)
189 headers = fp.info()
190 if not filename:
191 import tempfile
192 garbage, path = splittype(url)
193 garbage, path = splithost(path or "")
194 path, garbage = splitquery(path or "")
195 path, garbage = splitattr(path or "")
196 suffix = os.path.splitext(path)[1]
197 filename = tempfile.mktemp(suffix)
198 self.__tempfiles.append(filename)
199 result = filename, headers
200 if self.tempcache is not None:
201 self.tempcache[url] = result
202 tfp = open(filename, 'wb')
203 bs = 1024*8
204 size = -1
205 blocknum = 1
206 if reporthook:
207 if headers.has_key("content-length"):
208 size = int(headers["Content-Length"])
209 reporthook(0, bs, size)
210 block = fp.read(bs)
211 if reporthook:
212 reporthook(1, bs, size)
213 while block:
214 tfp.write(block)
215 block = fp.read(bs)
216 blocknum = blocknum + 1
217 if reporthook:
218 reporthook(blocknum, bs, size)
219 fp.close()
220 tfp.close()
221 del fp
222 del tfp
223 return result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000224
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000225 # Each method named open_<type> knows how to open that type of URL
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000226
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000227 def open_http(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000228 """Use HTTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000229 import httplib
230 user_passwd = None
231 if type(url) is type(""):
232 host, selector = splithost(url)
233 if host:
234 user_passwd, host = splituser(host)
235 host = unquote(host)
236 realhost = host
237 else:
238 host, selector = url
239 urltype, rest = splittype(selector)
240 url = rest
241 user_passwd = None
242 if string.lower(urltype) != 'http':
243 realhost = None
244 else:
245 realhost, rest = splithost(rest)
246 if realhost:
247 user_passwd, realhost = splituser(realhost)
248 if user_passwd:
249 selector = "%s://%s%s" % (urltype, realhost, rest)
250 #print "proxy via http:", host, selector
251 if not host: raise IOError, ('http error', 'no host given')
252 if user_passwd:
253 import base64
254 auth = string.strip(base64.encodestring(user_passwd))
255 else:
256 auth = None
257 h = httplib.HTTP(host)
258 if data is not None:
259 h.putrequest('POST', selector)
260 h.putheader('Content-type', 'application/x-www-form-urlencoded')
261 h.putheader('Content-length', '%d' % len(data))
262 else:
263 h.putrequest('GET', selector)
264 if auth: h.putheader('Authorization', 'Basic %s' % auth)
265 if realhost: h.putheader('Host', realhost)
266 for args in self.addheaders: apply(h.putheader, args)
267 h.endheaders()
268 if data is not None:
269 h.send(data + '\r\n')
270 errcode, errmsg, headers = h.getreply()
271 fp = h.getfile()
272 if errcode == 200:
273 return addinfourl(fp, headers, "http:" + url)
274 else:
275 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000276 return self.http_error(url, fp, errcode, errmsg, headers)
Guido van Rossum29aab751999-03-09 19:31:21 +0000277 else:
278 return self.http_error(url, fp, errcode, errmsg, headers, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000279
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000280 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000281 """Handle http errors.
282 Derived class can override this, or provide specific handlers
283 named http_error_DDD where DDD is the 3-digit error code."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000284 # First check if there's a specific handler for this error
285 name = 'http_error_%d' % errcode
286 if hasattr(self, name):
287 method = getattr(self, name)
288 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000289 result = method(url, fp, errcode, errmsg, headers)
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000290 else:
291 result = method(url, fp, errcode, errmsg, headers, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000292 if result: return result
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000293 return self.http_error_default(url, fp, errcode, errmsg, headers)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000294
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000295 def http_error_default(self, url, fp, errcode, errmsg, headers):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000296 """Default error handler: close the connection and raise IOError."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000297 void = fp.read()
298 fp.close()
299 raise IOError, ('http error', errcode, errmsg, headers)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000300
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000301 if hasattr(socket, "ssl"):
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000302 def open_https(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000303 """Use HTTPS protocol."""
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000304 import httplib
305 if type(url) is type(""):
306 host, selector = splithost(url)
307 user_passwd, host = splituser(host)
308 else:
309 host, selector = url
310 urltype, rest = splittype(selector)
311 if string.lower(urltype) == 'https':
312 realhost, rest = splithost(rest)
313 user_passwd, realhost = splituser(realhost)
314 if user_passwd:
315 selector = "%s://%s%s" % (urltype, realhost, rest)
Andrew M. Kuchling7ad47922000-06-10 01:41:48 +0000316 #print "proxy via https:", host, selector
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000317 if not host: raise IOError, ('https error', 'no host given')
318 if user_passwd:
319 import base64
320 auth = string.strip(base64.encodestring(user_passwd))
321 else:
322 auth = None
323 h = httplib.HTTPS(host, 0,
324 key_file=self.key_file,
325 cert_file=self.cert_file)
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000326 if data is not None:
327 h.putrequest('POST', selector)
328 h.putheader('Content-type',
329 'application/x-www-form-urlencoded')
330 h.putheader('Content-length', '%d' % len(data))
331 else:
332 h.putrequest('GET', selector)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000333 if auth: h.putheader('Authorization: Basic %s' % auth)
334 for args in self.addheaders: apply(h.putheader, args)
335 h.endheaders()
Andrew M. Kuchling43c5af02000-04-24 14:17:06 +0000336 if data is not None:
337 h.send(data + '\r\n')
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000338 errcode, errmsg, headers = h.getreply()
339 fp = h.getfile()
340 if errcode == 200:
341 return addinfourl(fp, headers, url)
342 else:
343 return self.http_error(url, fp, errcode, errmsg, headers)
344
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000345 def open_gopher(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000346 """Use Gopher protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000347 import gopherlib
348 host, selector = splithost(url)
349 if not host: raise IOError, ('gopher error', 'no host given')
350 host = unquote(host)
351 type, selector = splitgophertype(selector)
352 selector, query = splitquery(selector)
353 selector = unquote(selector)
354 if query:
355 query = unquote(query)
356 fp = gopherlib.send_query(selector, query, host)
357 else:
358 fp = gopherlib.send_selector(selector, host)
359 return addinfourl(fp, noheaders(), "gopher:" + url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000360
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000361 def open_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000362 """Use local file or FTP depending on form of URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000363 if url[:2] == '//' and url[2:3] != '/':
364 return self.open_ftp(url)
365 else:
366 return self.open_local_file(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000367
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000368 def open_local_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000369 """Use local file."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000370 import mimetypes, mimetools, StringIO
371 mtype = mimetypes.guess_type(url)[0]
372 headers = mimetools.Message(StringIO.StringIO(
373 'Content-Type: %s\n' % (mtype or 'text/plain')))
374 host, file = splithost(url)
375 if not host:
Guido van Rossum336a2011999-06-24 15:27:36 +0000376 urlfile = file
377 if file[:1] == '/':
378 urlfile = 'file://' + file
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000379 return addinfourl(open(url2pathname(file), 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000380 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000381 host, port = splitport(host)
382 if not port \
383 and socket.gethostbyname(host) in (localhost(), thishost()):
Guido van Rossum336a2011999-06-24 15:27:36 +0000384 urlfile = file
385 if file[:1] == '/':
386 urlfile = 'file://' + file
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000387 return addinfourl(open(url2pathname(file), 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000388 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000389 raise IOError, ('local file error', 'not on local host')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000390
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000391 def open_ftp(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000392 """Use FTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000393 host, path = splithost(url)
394 if not host: raise IOError, ('ftp error', 'no host given')
395 host, port = splitport(host)
396 user, host = splituser(host)
397 if user: user, passwd = splitpasswd(user)
398 else: passwd = None
399 host = unquote(host)
400 user = unquote(user or '')
401 passwd = unquote(passwd or '')
402 host = socket.gethostbyname(host)
403 if not port:
404 import ftplib
405 port = ftplib.FTP_PORT
406 else:
407 port = int(port)
408 path, attrs = splitattr(path)
409 path = unquote(path)
410 dirs = string.splitfields(path, '/')
411 dirs, file = dirs[:-1], dirs[-1]
412 if dirs and not dirs[0]: dirs = dirs[1:]
Guido van Rossum5e006a31999-08-18 17:40:33 +0000413 if dirs and not dirs[0]: dirs[0] = '/'
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000414 key = (user, host, port, string.joinfields(dirs, '/'))
415 # XXX thread unsafe!
416 if len(self.ftpcache) > MAXFTPCACHE:
417 # Prune the cache, rather arbitrarily
418 for k in self.ftpcache.keys():
419 if k != key:
420 v = self.ftpcache[k]
421 del self.ftpcache[k]
422 v.close()
423 try:
424 if not self.ftpcache.has_key(key):
425 self.ftpcache[key] = \
426 ftpwrapper(user, passwd, host, port, dirs)
427 if not file: type = 'D'
428 else: type = 'I'
429 for attr in attrs:
430 attr, value = splitvalue(attr)
431 if string.lower(attr) == 'type' and \
432 value in ('a', 'A', 'i', 'I', 'd', 'D'):
433 type = string.upper(value)
434 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
435 if retrlen is not None and retrlen >= 0:
436 import mimetools, StringIO
437 headers = mimetools.Message(StringIO.StringIO(
438 'Content-Length: %d\n' % retrlen))
439 else:
440 headers = noheaders()
441 return addinfourl(fp, headers, "ftp:" + url)
442 except ftperrors(), msg:
443 raise IOError, ('ftp error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000444
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000445 def open_data(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000446 """Use "data" URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000447 # ignore POSTed data
448 #
449 # syntax of data URLs:
450 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
451 # mediatype := [ type "/" subtype ] *( ";" parameter )
452 # data := *urlchar
453 # parameter := attribute "=" value
454 import StringIO, mimetools, time
455 try:
456 [type, data] = string.split(url, ',', 1)
457 except ValueError:
458 raise IOError, ('data error', 'bad data URL')
459 if not type:
460 type = 'text/plain;charset=US-ASCII'
461 semi = string.rfind(type, ';')
462 if semi >= 0 and '=' not in type[semi:]:
463 encoding = type[semi+1:]
464 type = type[:semi]
465 else:
466 encoding = ''
467 msg = []
468 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
469 time.gmtime(time.time())))
470 msg.append('Content-type: %s' % type)
471 if encoding == 'base64':
472 import base64
473 data = base64.decodestring(data)
474 else:
475 data = unquote(data)
476 msg.append('Content-length: %d' % len(data))
477 msg.append('')
478 msg.append(data)
479 msg = string.join(msg, '\n')
480 f = StringIO.StringIO(msg)
481 headers = mimetools.Message(f, 0)
482 f.fileno = None # needed for addinfourl
483 return addinfourl(f, headers, url)
Guido van Rossum6d4d1c21998-03-12 14:32:55 +0000484
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000485
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    # Bound on the chain of redirects followed for one request; a false
    # value disables the check.
    maxtries = 10
    # Depth of the redirect chain currently being followed.
    tries = 0

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as URLopener, keywords included."""
        apply(URLopener.__init__, (self,) + args, kwargs)
        self.auth_cache = {}
        self.tries = 0

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        Follows the 'location' (or 'uri') header.  Returns None when
        neither is present, so http_error() falls back to the default
        handler.  Once the chain reaches maxtries redirects, the reply
        is handed to http_error_default() with code 500 instead of
        being followed.
        """
        self.tries = self.tries + 1
        try:
            if self.maxtries and self.tries >= self.maxtries:
                return self.http_error_default(
                    url, fp, 500,
                    "Internal Server Error: Redirect Recursion", headers)
            return self._follow_redirect(url, fp, headers, data)
        finally:
            # Runs while unwinding, i.e. only after the whole chain ended.
            self.tries = 0

    def _follow_redirect(self, url, fp, headers, data):
        """Open the redirect target named in headers, or return None."""
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin("http:" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.

        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt

        Only the 'basic' scheme is handled; the retry is delegated to
        retry_<type>_basic_auth, where <type> is the scheme recorded by
        open().  Returns None in every other case.
        """
        if headers.has_key('www-authenticate'):
            stuff = headers['www-authenticate']
            import re
            match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
            if match:
                scheme, realm = match.groups()
                if string.lower(scheme) == 'basic':
                    name = 'retry_' + self.type + '_basic_auth'
                    if data is None:
                        return getattr(self,name)(url, realm)
                    else:
                        return getattr(self,name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Repeat an http request with user:passwd added to the host."""
        return self._retry_basic_auth('http', url, realm, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Repeat an https request with user:passwd added to the host."""
        return self._retry_basic_auth('https', url, realm, data)

    def _retry_basic_auth(self, scheme, url, realm, data):
        """Rebuild url with credentials and reopen it; None if none given."""
        host, selector = splithost(url)
        # i is non-zero when the failed URL already carried credentials;
        # it then tells get_user_passwd() to drop its cached entry.
        i = string.find(host, '@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = user + ':' + passwd + '@' + host
        newurl = scheme + '://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return (user, passwd) for realm at host, prompting if needed.

        Answers are cached per realm and lower-cased host.  A true
        clear_cache discards the cached entry and prompts again.
        """
        key = realm + '@' + string.lower(host)
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000582
583
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000584# Utility functions
585
_localhost = None


def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The address is looked up once and remembered for later calls.
    """
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000593
_thishost = None


def thishost():
    """Return the IP address of the current host.

    The address is looked up once and remembered for later calls.
    """
    global _thishost
    if _thishost:
        return _thishost
    own_name = socket.gethostname()
    _thishost = socket.gethostbyname(own_name)
    return _thishost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000601
_ftperrors = None


def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported only on the first call; the value is then cached.
    """
    global _ftperrors
    if _ftperrors:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000610
_noheaders = None


def noheaders():
    """Return an empty mimetools.Message object.

    The object is built on the first call and shared by all later calls.
    """
    global _noheaders
    # Compare with None rather than testing truth: the cached value may
    # itself be falsy, which would defeat the cache on every call.
    if _noheaders is None:
        import mimetools
        import StringIO
        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
        _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000621
622
623# Utility classes
624
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000625class ftpwrapper:
Guido van Rossume7b146f2000-02-04 15:28:42 +0000626 """Class used by open_ftp() for cache of open FTP connections."""
627
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000628 def __init__(self, user, passwd, host, port, dirs):
629 self.user = user
630 self.passwd = passwd
631 self.host = host
632 self.port = port
633 self.dirs = dirs
634 self.init()
Guido van Rossume7b146f2000-02-04 15:28:42 +0000635
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000636 def init(self):
637 import ftplib
638 self.busy = 0
639 self.ftp = ftplib.FTP()
640 self.ftp.connect(self.host, self.port)
641 self.ftp.login(self.user, self.passwd)
642 for dir in self.dirs:
643 self.ftp.cwd(dir)
Guido van Rossume7b146f2000-02-04 15:28:42 +0000644
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000645 def retrfile(self, file, type):
646 import ftplib
647 self.endtransfer()
648 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
649 else: cmd = 'TYPE ' + type; isdir = 0
650 try:
651 self.ftp.voidcmd(cmd)
652 except ftplib.all_errors:
653 self.init()
654 self.ftp.voidcmd(cmd)
655 conn = None
656 if file and not isdir:
657 # Use nlst to see if the file exists at all
658 try:
659 self.ftp.nlst(file)
660 except ftplib.error_perm, reason:
661 raise IOError, ('ftp error', reason), sys.exc_info()[2]
662 # Restore the transfer mode!
663 self.ftp.voidcmd(cmd)
664 # Try to retrieve as a file
665 try:
666 cmd = 'RETR ' + file
667 conn = self.ftp.ntransfercmd(cmd)
668 except ftplib.error_perm, reason:
669 if reason[:3] != '550':
670 raise IOError, ('ftp error', reason), sys.exc_info()[2]
671 if not conn:
672 # Set transfer mode to ASCII!
673 self.ftp.voidcmd('TYPE A')
674 # Try a directory listing
675 if file: cmd = 'LIST ' + file
676 else: cmd = 'LIST'
677 conn = self.ftp.ntransfercmd(cmd)
678 self.busy = 1
679 # Pass back both a suitably decorated object and a retrieval length
680 return (addclosehook(conn[0].makefile('rb'),
681 self.endtransfer), conn[1])
682 def endtransfer(self):
683 if not self.busy:
684 return
685 self.busy = 0
686 try:
687 self.ftp.voidresp()
688 except ftperrors():
689 pass
Guido van Rossume7b146f2000-02-04 15:28:42 +0000690
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000691 def close(self):
692 self.endtransfer()
693 try:
694 self.ftp.close()
695 except ftperrors():
696 pass
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000697
class addbase:
    """Base class for addinfo and addclosehook."""

    def __init__(self, fp):
        """Wrap fp, exposing its reading methods on this object."""
        self.fp = fp
        self.read = fp.read
        self.readline = fp.readline
        # These two are only forwarded when the wrapped object has them.
        for optional in ("readlines", "fileno"):
            if hasattr(fp, optional):
                setattr(self, optional, getattr(fp, optional))

    def __repr__(self):
        details = (self.__class__.__name__, repr(id(self)), repr(self.fp))
        return '<%s at %s whose fp = %s>' % details

    def close(self):
        """Drop the forwarded methods and close the wrapped file."""
        for name in ("read", "readline", "readlines", "fileno"):
            setattr(self, name, None)
        wrapped = self.fp
        if wrapped:
            wrapped.close()
        self.fp = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000719
class addclosehook(addbase):
    """Class to add a close hook to an open file.

    close() first closes the wrapped file and then calls
    closehook(*hookargs).  The hook and its arguments are cleared after
    the call, so closing again does not invoke the hook a second time.
    """

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        addbase.close(self)
        hook = self.closehook
        if hook:
            hook(*self.hookargs)
            self.closehook = None
            self.hookargs = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000734
class addinfo(addbase):
    """Class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        # fp is the open file to wrap; headers is the object info() returns.
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000744
class addinfourl(addbase):
    """Class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        # fp is the open file to wrap; headers and url are stored as given
        # and handed back by info() and geturl() respectively.
        addbase.__init__(self, fp)
        self.headers = headers
        self.url = url

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers

    def geturl(self):
        """Return the URL given to the constructor."""
        return self.url
Guido van Rossume6ad8911996-09-10 17:02:56 +0000758
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000759
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL.

    url is returned unchanged when it already has a scheme, or when it
    names a host and base has no scheme to lend it.  Otherwise the scheme
    (and, if url names no host, the host) is taken from base.  A relative
    path in url is resolved against base's path once base's '#tag' and
    '?query' have been dropped; leading '../' components climb up the
    base path.
    """
    scheme, rest = splittype(url)
    if scheme:
        # url is complete (it has its own scheme): nothing to inherit.
        return url
    netloc, rest = splithost(rest)
    scheme, base_rest = splittype(base)  # the scheme always comes from base
    if netloc:
        # url names its own host, so only the scheme is inherited.
        if scheme:
            return scheme + '://' + netloc + rest
        # No scheme to inherit either, so url started with '//': keep it.
        return url
    netloc, base_rest = splithost(base_rest)  # inherit the host
    base_rest = splittag(base_rest)[0]        # drop the base's '#tag'
    base_rest = splitquery(base_rest)[0]      # ... and its '?query'
    if rest[:1] != '/':
        # Non-absolute path: work out how much of the base path to keep.
        if rest[:1] in ('#', '?'):
            # url is only a tag or query: attach it to the whole base path.
            cut = len(base_rest)
        else:
            # Otherwise the last component of the base path is replaced.
            cut = base_rest.rfind('/')
        if cut >= 0:
            base_rest = base_rest[:cut + 1]
        elif netloc:
            # Base path is not absolute but a host is present: make it so.
            base_rest = '/'
        else:
            # No host either: keep the result non-absolute.
            base_rest = ''
        # Interpret ../ (important because of symlinks)
        while base_rest and rest[:3] == '../':
            rest = rest[3:]
            cut = base_rest[:-1].rfind('/')
            if cut < 0:
                base_rest = ''
            elif cut == 0:
                # Reached the root; further '../' cannot climb any higher.
                base_rest = '/'
                break
            else:
                base_rest = base_rest[:cut + 1]

        rest = base_rest + rest
    if scheme and netloc:
        return scheme + '://' + netloc + rest
    if scheme:
        return scheme + ':' + rest
    if netloc:
        return '//' + netloc + rest  # don't know what this means
    return rest
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000814
815
Guido van Rossum7c395db1994-07-04 22:14:49 +0000816# Utilities to parse URLs (most of these return None for missing parts):
Sjoerd Mullendere0371b81995-11-10 10:36:07 +0000817# unwrap('<URL:type://host/path>') --> 'type://host/path'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000818# splittype('type:opaquestring') --> 'type', 'opaquestring'
819# splithost('//host[:port]/path') --> 'host[:port]', '/path'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000820# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
821# splitpasswd('user:passwd') -> 'user', 'passwd'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000822# splitport('host:port') --> 'host', 'port'
823# splitquery('/path?query') --> '/path', 'query'
824# splittag('/path#tag') --> '/path', 'tag'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000825# splitattr('/path;attr1=value1;attr2=value2;...') ->
826# '/path', ['attr1=value1', 'attr2=value2', ...]
827# splitvalue('attr=value') --> 'attr', 'value'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000828# splitgophertype('/Xselector') --> 'X', 'selector'
829# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
831
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Surrounding whitespace, one pair of enclosing angle brackets and a
    leading 'URL:' marker are removed, in that order.
    """
    cleaned = url.strip()
    if cleaned.startswith('<') and cleaned.endswith('>'):
        cleaned = cleaned[1:-1].strip()
    if cleaned.startswith('URL:'):
        cleaned = cleaned[4:].strip()
    return cleaned
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000839
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    Returns (None, url) when url does not start with a 'type:' prefix.
    """
    global _typeprog
    if _typeprog is None:
        # Compiled on first use and cached in the module-level variable.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # The pattern is anchored at the start and ends with the ':', so the
    # match ends exactly one character past the scheme.
    return found.group(1), url[found.end():]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000853
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # Compiled on first use and cached in the module-level variable.
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000865
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    Returns (None, host) when host contains no '@'.
    """
    global _userprog
    if _userprog is None:
        # Compiled on first use and cached in the module-level variable.
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    found = _userprog.match(host)
    if found is None:
        return None, host
    return found.group(1, 2)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000877
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    Returns (user, None) when user contains no ':'.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compiled on first use and cached in the module-level variable.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000889
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  Returns (host, None)
    when host does not end in ':' followed by at least one digit.
    """
    global _portprog
    if _portprog is None:
        # Compiled on first use and cached in the module-level variable.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000902
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        # Compiled on first use and cached in the module-level variable.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port = found.group(1, 2)
    try:
        # An empty port string is rejected by int() just like any other
        # non-number, so it needs no separate check.
        nport = int(port)
    except ValueError:
        nport = None
    return host, nport
Guido van Rossum53725a21996-06-13 19:12:35 +0000924
Guido van Rossum332e1441997-09-29 23:23:46 +0000925_queryprog = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000926def splitquery(url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000927 """splitquery('/path?query') --> '/path', 'query'."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000928 global _queryprog
929 if _queryprog is None:
930 import re
931 _queryprog = re.compile('^(.*)\?([^?]*)$')
Guido van Rossum332e1441997-09-29 23:23:46 +0000932
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000933 match = _queryprog.match(url)
934 if match: return match.group(1, 2)
935 return url, None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000936
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#' in url.  Returns (url, None) when
    url contains no '#'.
    """
    global _tagprog
    if _tagprog is None:
        # Compiled on first use and cached in the module-level variable.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000948
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    pieces = url.split(';')
    path = pieces[0]
    attrs = pieces[1:]
    return path, attrs
Guido van Rossum7c395db1994-07-04 22:14:49 +0000954
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Returns (attr, None) when attr
    contains no '='.
    """
    global _valueprog
    if _valueprog is None:
        # Compiled on first use and cached in the module-level variable.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000966
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    Returns (None, selector) unless selector is a '/' followed by at
    least one more character.
    """
    if selector[:1] != '/' or not selector[1:2]:
        return None, selector
    return selector[1], selector[2:]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000972
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Every '%' followed by exactly two hexadecimal digits is replaced by
    the character with that code.  A '%' that is not followed by two hex
    digits (for example a trailing '%', '%zz' or '%+f') is left in the
    result unchanged.
    """
    hexdigits = string.hexdigits
    pieces = s.split('%')
    res = [pieces[0]]
    append = res.append
    for item in pieces[1:]:
        code = item[:2]
        # Check the digits explicitly: the integer parser would also accept
        # a sign or surrounding whitespace ('+f', ' f'), which is not a
        # valid escape and must not be decoded.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            append(chr(int(code, 16)) + item[2:])
        else:
            append('%' + item)
    return ''.join(res)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000991
def unquote_plus(s):
    """unquote_plus('abc+def%21') -> 'abc def!'.

    Like unquote(), but every '+' is first turned into a space.
    """
    # replace '+' with ' ' (a no-op when s contains no '+')
    return unquote(s.replace('+', ' '))
Guido van Rossum0564e121996-12-13 14:47:36 +0000997
# Characters that are never quoted.  The ASCII letters are spelled out
# rather than taken from string.letters, whose value follows the current
# locale and could silently add non-ASCII letters to the safe set.
always_safe = ('abcdefghijklmnopqrstuvwxyz'
               'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               + string.digits + '_,.-')
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'.

    Each character of s that is neither in always_safe nor in safe is
    replaced by '%' followed by its two-digit lowercase hexadecimal code.
    By default '/' is additionally left unquoted.
    """
    # XXX Can speed this up an order of magnitude
    unquoted = always_safe + safe
    res = []
    append = res.append
    for c in s:
        if c in unquoted:
            append(c)
        else:
            append('%%%02x' % ord(c))
    return ''.join(res)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001009
def quote_plus(s, safe = '/'):
    """quote_plus('abc def') -> 'abc+def'.

    Like quote(), except that each space in s becomes '+' instead of
    being percent-quoted.
    """
    # XXX Can speed this up an order of magnitude
    if ' ' not in s:
        return quote(s, safe)
    # replace ' ' with '+': quote the space-separated pieces one by one
    quoted = []
    for piece in s.split(' '):
        quoted.append(quote(piece, safe))
    return '+'.join(quoted)
Guido van Rossum0564e121996-12-13 14:47:36 +00001020
def urlencode(dict):
    """Encode a dictionary of form entries into a URL query string.

    Keys and values are converted with str() and quoted with
    quote_plus(); the resulting 'key=value' pairs are joined with '&' in
    the order dict.items() yields them.
    """
    pairs = []
    for key, value in dict.items():
        pairs.append(quote_plus(str(key)) + '=' + quote_plus(str(value)))
    return '&'.join(pairs)
Guido van Rossum810a3391998-07-22 21:33:23 +00001029
Guido van Rossum442e7201996-03-20 15:33:11 +00001030
1031# Proxy handling
if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        On the mac the proxies are read from Internet Config.  Only an
        HTTP proxy is looked up: when the UseHTTPProxy entry is set, the
        HTTPProxyHost entry supplies the proxy host.  An empty dictionary
        is returned when the ic module or its configuration cannot be
        opened.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            settings = ic.IC()
        except ic.error:
            return {}
        found = {}
        # HTTP:
        if settings.has_key('UseHTTPProxy') and settings['UseHTTPProxy']:
            try:
                http_host = settings['HTTPProxyHost']
            except ic.error:
                pass
            else:
                found['http'] = 'http://%s' % http_host
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return found

else:
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Scan the environment for variables named <scheme>_proxy
        (compared case-insensitively); this seems to be the standard
        convention.  Variables with an empty value are skipped.  If you
        need a different way, you can pass a proxies dictionary to the
        [Fancy]URLopener constructor.

        """
        suffix = '_proxy'
        found = {}
        for variable, setting in os.environ.items():
            variable = variable.lower()
            if setting and variable.endswith(suffix):
                found[variable[:-len(suffix)]] = setting
        return found
Guido van Rossum442e7201996-03-20 15:33:11 +00001079
1080
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001081# Test and time quote() and unquote()
1082def test1():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001083 import time
1084 s = ''
1085 for i in range(256): s = s + chr(i)
1086 s = s*4
1087 t0 = time.time()
1088 qs = quote(s)
1089 uqs = unquote(qs)
1090 t1 = time.time()
1091 if uqs != s:
1092 print 'Wrong!'
1093 print `s`
1094 print `qs`
1095 print `uqs`
1096 print round(t1 - t0, 3), 'sec'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001097
1098
def reporthook(blocknum, blocksize, totalsize):
    """Print the block number, block size and total size it is called with.

    test() passes this function to urlretrieve() as its third argument.
    """
    # Report during remote transfers
    print "Block number: %d, Block size: %d, Total size: %d" % (blocknum, blocksize, totalsize)
1102
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001103# Test program
Guido van Rossum23490151998-06-25 02:39:00 +00001104def test(args=[]):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001105 if not args:
1106 args = [
1107 '/etc/passwd',
1108 'file:/etc/passwd',
1109 'file://localhost/etc/passwd',
1110 'ftp://ftp.python.org/etc/passwd',
1111## 'gopher://gopher.micro.umn.edu/1/',
1112 'http://www.python.org/index.html',
1113 ]
Guido van Rossum09c8b6c1999-12-07 21:37:17 +00001114 if hasattr(URLopener, "open_https"):
1115 args.append('https://synergy.as.cmu.edu/~geek/')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001116 try:
1117 for url in args:
1118 print '-'*10, url, '-'*10
1119 fn, h = urlretrieve(url, None, reporthook)
1120 print fn, h
1121 if h:
1122 print '======'
1123 for k in h.keys(): print k + ':', h[k]
1124 print '======'
1125 fp = open(fn, 'rb')
1126 data = fp.read()
1127 del fp
1128 if '\r' in data:
1129 table = string.maketrans("", "")
1130 data = string.translate(data, table, "\r")
1131 print data
1132 fn, h = None, None
1133 print '-'*40
1134 finally:
1135 urlcleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001136
Guido van Rossum23490151998-06-25 02:39:00 +00001137def main():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001138 import getopt, sys
1139 try:
1140 opts, args = getopt.getopt(sys.argv[1:], "th")
1141 except getopt.error, msg:
1142 print msg
1143 print "Use -h for help"
1144 return
1145 t = 0
1146 for o, a in opts:
1147 if o == '-t':
1148 t = t + 1
1149 if o == '-h':
1150 print "Usage: python urllib.py [-t] [url ...]"
1151 print "-t runs self-test;",
1152 print "otherwise, contents of urls are printed"
1153 return
1154 if t:
1155 if t > 1:
1156 test1()
1157 test(args)
1158 else:
1159 if not args:
1160 print "Use -h for help"
1161 for url in args:
1162 print urlopen(url).read(),
Guido van Rossum23490151998-06-25 02:39:00 +00001163
# Run the command-line driver (main) when this file is executed as a script
if __name__ == '__main__':
    main()