blob: badfa0ee92ba3c60a414faa42ca2b66f4654a303 [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""Open an arbitrary URL.
2
3See the following document for more info on URLs:
4"Names and Addresses, URIs, URLs, URNs, URCs", at
5http://www.w3.org/pub/WWW/Addressing/Overview.html
6
7See also the HTTP spec (from which the error codes are derived):
8"HTTP - Hypertext Transfer Protocol", at
9http://www.w3.org/pub/WWW/Protocols/
10
11Related standards and specs:
12- RFC1808: the "relative URL" spec. (authoritative status)
13- RFC1738 - the "URL standard". (authoritative status)
14- RFC1630 - the "URI spec". (informational status)
15
16The object returned by URLopener().open(file) will differ per
protocol.  All you know is that it has methods read(), readline(),
18readlines(), fileno(), close() and info(). The read*(), fileno()
Fredrik Lundhb49f88b2000-09-24 18:51:25 +000019and close() methods work like those of open files.
Guido van Rossume7b146f2000-02-04 15:28:42 +000020The info() method returns a mimetools.Message object which can be
21used to query various info about the object, if available.
22(mimetools.Message objects are queried with the getheader() method.)
23"""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000024
Guido van Rossum7c395db1994-07-04 22:14:49 +000025import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000026import socket
Jack Jansendc3e3f61995-12-15 13:22:13 +000027import os
Guido van Rossum3c8484e1996-11-20 22:02:24 +000028import sys
Martin v. Löwis1d994332000-12-03 18:30:10 +000029import types
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000030
Martin v. Löwis1d994332000-12-03 18:30:10 +000031__version__ = '1.14' # XXX This version is not always updated :-(
Guido van Rossumf668d171997-06-06 21:11:11 +000032
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000033MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
Guido van Rossum6cb15a01995-06-22 19:00:13 +000034
Jack Jansendc3e3f61995-12-15 13:22:13 +000035# Helper for non-unix systems
# Pick the url2pathname()/pathname2url() pair for this platform:
# 'mac' and 'nt' import a dedicated implementation, every other
# os.name gets the thin wrappers defined in the else branch.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        # Delegates to unquote(); no other path translation is done here.
        return unquote(pathname)
    def pathname2url(pathname):
        # Delegates to quote(); no other path translation is done here.
        return quote(pathname)
Guido van Rossum33add0a1998-12-18 15:25:22 +000045
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000046# This really consists of two pieces:
47# (1) a class which handles opening of all sorts of URLs
48# (plus assorted utilities etc.)
49# (2) a set of functions for parsing URLs
50# XXX Should these be separated out into different modules?
51
52
53# Shortcut for basic usage
_urlopener = None
def urlopen(url, data=None):
    """urlopen(url [, data]) -> open file-like object

    Uses the shared module-level FancyURLopener, creating it on first
    use.  'data' is only forwarded to the opener when it is not None.
    """
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    opener = _urlopener
    if data is not None:
        return opener.open(url, data)
    return opener.open(url)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve 'url' through the shared module-level opener.

    All four arguments are passed positionally to the opener's
    retrieve() method and its result is returned unchanged.  The
    shared FancyURLopener is created on first use.
    """
    global _urlopener
    _urlopener = _urlopener or FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    """Run cleanup() on the shared opener, if one has been created."""
    opener = _urlopener
    if opener:
        opener.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000072
73
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000074ftpcache = {}
75class URLopener:
Guido van Rossume7b146f2000-02-04 15:28:42 +000076 """Class to open URLs.
77 This is a class rather than just a subroutine because we may need
78 more than one set of global protocol-specific options.
79 Note -- this is a base class for those who don't want the
80 automatic handling of errors type 302 (relocated) and 401
81 (authorization needed)."""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000082
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000083 __tempfiles = None
Guido van Rossum29e77811996-11-27 19:39:58 +000084
Guido van Rossumba311382000-08-24 16:18:04 +000085 version = "Python-urllib/%s" % __version__
86
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000087 # Constructor
Guido van Rossum09c8b6c1999-12-07 21:37:17 +000088 def __init__(self, proxies=None, **x509):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000089 if proxies is None:
90 proxies = getproxies()
91 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
92 self.proxies = proxies
Guido van Rossum09c8b6c1999-12-07 21:37:17 +000093 self.key_file = x509.get('key_file')
94 self.cert_file = x509.get('cert_file')
Guido van Rossumba311382000-08-24 16:18:04 +000095 self.addheaders = [('User-agent', self.version)]
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000096 self.__tempfiles = []
97 self.__unlink = os.unlink # See cleanup()
98 self.tempcache = None
99 # Undocumented feature: if you assign {} to tempcache,
100 # it is used to cache files retrieved with
101 # self.retrieve(). This is not enabled by default
102 # since it does not work for changing documents (and I
103 # haven't got the logic to check expiration headers
104 # yet).
105 self.ftpcache = ftpcache
106 # Undocumented feature: you can use a different
107 # ftp cache by assigning to the .ftpcache member;
108 # in case you want logically independent URL openers
109 # XXX This is not threadsafe. Bah.
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000110
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000111 def __del__(self):
112 self.close()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000113
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000114 def close(self):
115 self.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000116
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000117 def cleanup(self):
118 # This code sometimes runs when the rest of this module
119 # has already been deleted, so it can't use any globals
120 # or import anything.
121 if self.__tempfiles:
122 for file in self.__tempfiles:
123 try:
124 self.__unlink(file)
125 except:
126 pass
127 del self.__tempfiles[:]
128 if self.tempcache:
129 self.tempcache.clear()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000130
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000131 def addheader(self, *args):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000132 """Add a header to be used by the HTTP interface only
133 e.g. u.addheader('Accept', 'sound/basic')"""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000134 self.addheaders.append(args)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000135
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000136 # External interface
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000137 def open(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000138 """Use URLopener().open(file) instead of open(file, 'r')."""
Martin v. Löwis1d994332000-12-03 18:30:10 +0000139 fullurl = unwrap(toBytes(fullurl))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000140 if self.tempcache and self.tempcache.has_key(fullurl):
141 filename, headers = self.tempcache[fullurl]
142 fp = open(filename, 'rb')
143 return addinfourl(fp, headers, fullurl)
Martin v. Löwis1d994332000-12-03 18:30:10 +0000144 urltype, url = splittype(fullurl)
145 if not urltype:
146 urltype = 'file'
147 if self.proxies.has_key(urltype):
148 proxy = self.proxies[urltype]
149 urltype, proxyhost = splittype(proxy)
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000150 host, selector = splithost(proxyhost)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000151 url = (host, fullurl) # Signal special case to open_*()
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000152 else:
153 proxy = None
Martin v. Löwis1d994332000-12-03 18:30:10 +0000154 name = 'open_' + urltype
155 self.type = urltype
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000156 if '-' in name:
157 # replace - with _
158 name = string.join(string.split(name, '-'), '_')
159 if not hasattr(self, name):
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000160 if proxy:
161 return self.open_unknown_proxy(proxy, fullurl, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000162 else:
163 return self.open_unknown(fullurl, data)
164 try:
165 if data is None:
166 return getattr(self, name)(url)
167 else:
168 return getattr(self, name)(url, data)
169 except socket.error, msg:
170 raise IOError, ('socket error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000171
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000172 def open_unknown(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000173 """Overridable interface to open unknown URL type."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000174 type, url = splittype(fullurl)
175 raise IOError, ('url error', 'unknown url type', type)
Guido van Rossumca445401995-08-29 19:19:12 +0000176
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000177 def open_unknown_proxy(self, proxy, fullurl, data=None):
178 """Overridable interface to open unknown URL type."""
179 type, url = splittype(fullurl)
180 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
181
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000182 # External interface
Sjoerd Mullenderd7b86f02000-08-25 11:23:36 +0000183 def retrieve(self, url, filename=None, reporthook=None, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000184 """retrieve(url) returns (filename, None) for a local object
185 or (tempfilename, headers) for a remote object."""
Martin v. Löwis1d994332000-12-03 18:30:10 +0000186 url = unwrap(toBytes(url))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000187 if self.tempcache and self.tempcache.has_key(url):
188 return self.tempcache[url]
189 type, url1 = splittype(url)
190 if not filename and (not type or type == 'file'):
191 try:
192 fp = self.open_local_file(url1)
193 hdrs = fp.info()
194 del fp
195 return url2pathname(splithost(url1)[1]), hdrs
196 except IOError, msg:
197 pass
Fred Drake316a7932000-08-24 01:01:26 +0000198 fp = self.open(url, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000199 headers = fp.info()
200 if not filename:
201 import tempfile
202 garbage, path = splittype(url)
203 garbage, path = splithost(path or "")
204 path, garbage = splitquery(path or "")
205 path, garbage = splitattr(path or "")
206 suffix = os.path.splitext(path)[1]
207 filename = tempfile.mktemp(suffix)
208 self.__tempfiles.append(filename)
209 result = filename, headers
210 if self.tempcache is not None:
211 self.tempcache[url] = result
212 tfp = open(filename, 'wb')
213 bs = 1024*8
214 size = -1
215 blocknum = 1
216 if reporthook:
217 if headers.has_key("content-length"):
218 size = int(headers["Content-Length"])
219 reporthook(0, bs, size)
220 block = fp.read(bs)
221 if reporthook:
222 reporthook(1, bs, size)
223 while block:
224 tfp.write(block)
225 block = fp.read(bs)
226 blocknum = blocknum + 1
227 if reporthook:
228 reporthook(blocknum, bs, size)
229 fp.close()
230 tfp.close()
231 del fp
232 del tfp
233 return result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000234
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000235 # Each method named open_<type> knows how to open that type of URL
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000236
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000237 def open_http(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000238 """Use HTTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000239 import httplib
240 user_passwd = None
Martin v. Löwis1d994332000-12-03 18:30:10 +0000241 if type(url) is types.StringType:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000242 host, selector = splithost(url)
243 if host:
244 user_passwd, host = splituser(host)
245 host = unquote(host)
246 realhost = host
247 else:
248 host, selector = url
249 urltype, rest = splittype(selector)
250 url = rest
251 user_passwd = None
252 if string.lower(urltype) != 'http':
253 realhost = None
254 else:
255 realhost, rest = splithost(rest)
256 if realhost:
257 user_passwd, realhost = splituser(realhost)
258 if user_passwd:
259 selector = "%s://%s%s" % (urltype, realhost, rest)
260 #print "proxy via http:", host, selector
261 if not host: raise IOError, ('http error', 'no host given')
262 if user_passwd:
263 import base64
264 auth = string.strip(base64.encodestring(user_passwd))
265 else:
266 auth = None
267 h = httplib.HTTP(host)
268 if data is not None:
269 h.putrequest('POST', selector)
270 h.putheader('Content-type', 'application/x-www-form-urlencoded')
271 h.putheader('Content-length', '%d' % len(data))
272 else:
273 h.putrequest('GET', selector)
274 if auth: h.putheader('Authorization', 'Basic %s' % auth)
275 if realhost: h.putheader('Host', realhost)
276 for args in self.addheaders: apply(h.putheader, args)
277 h.endheaders()
278 if data is not None:
279 h.send(data + '\r\n')
280 errcode, errmsg, headers = h.getreply()
281 fp = h.getfile()
282 if errcode == 200:
283 return addinfourl(fp, headers, "http:" + url)
284 else:
285 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000286 return self.http_error(url, fp, errcode, errmsg, headers)
Guido van Rossum29aab751999-03-09 19:31:21 +0000287 else:
288 return self.http_error(url, fp, errcode, errmsg, headers, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000289
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000290 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000291 """Handle http errors.
292 Derived class can override this, or provide specific handlers
293 named http_error_DDD where DDD is the 3-digit error code."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000294 # First check if there's a specific handler for this error
295 name = 'http_error_%d' % errcode
296 if hasattr(self, name):
297 method = getattr(self, name)
298 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000299 result = method(url, fp, errcode, errmsg, headers)
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000300 else:
301 result = method(url, fp, errcode, errmsg, headers, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000302 if result: return result
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000303 return self.http_error_default(url, fp, errcode, errmsg, headers)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000304
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000305 def http_error_default(self, url, fp, errcode, errmsg, headers):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000306 """Default error handler: close the connection and raise IOError."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000307 void = fp.read()
308 fp.close()
309 raise IOError, ('http error', errcode, errmsg, headers)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000310
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000311 if hasattr(socket, "ssl"):
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000312 def open_https(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000313 """Use HTTPS protocol."""
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000314 import httplib
Fred Drake567ca8e2000-08-21 21:42:42 +0000315 user_passwd = None
Martin v. Löwis1d994332000-12-03 18:30:10 +0000316 if type(url) in types.StringTypes:
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000317 host, selector = splithost(url)
Fred Drake567ca8e2000-08-21 21:42:42 +0000318 if host:
319 user_passwd, host = splituser(host)
320 host = unquote(host)
321 realhost = host
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000322 else:
323 host, selector = url
324 urltype, rest = splittype(selector)
Fred Drake567ca8e2000-08-21 21:42:42 +0000325 url = rest
326 user_passwd = None
327 if string.lower(urltype) != 'https':
328 realhost = None
329 else:
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000330 realhost, rest = splithost(rest)
Fred Drake567ca8e2000-08-21 21:42:42 +0000331 if realhost:
332 user_passwd, realhost = splituser(realhost)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000333 if user_passwd:
334 selector = "%s://%s%s" % (urltype, realhost, rest)
Andrew M. Kuchling7ad47922000-06-10 01:41:48 +0000335 #print "proxy via https:", host, selector
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000336 if not host: raise IOError, ('https error', 'no host given')
337 if user_passwd:
338 import base64
339 auth = string.strip(base64.encodestring(user_passwd))
340 else:
341 auth = None
342 h = httplib.HTTPS(host, 0,
343 key_file=self.key_file,
344 cert_file=self.cert_file)
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000345 if data is not None:
346 h.putrequest('POST', selector)
347 h.putheader('Content-type',
348 'application/x-www-form-urlencoded')
349 h.putheader('Content-length', '%d' % len(data))
350 else:
351 h.putrequest('GET', selector)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000352 if auth: h.putheader('Authorization: Basic %s' % auth)
Fred Drake567ca8e2000-08-21 21:42:42 +0000353 if realhost: h.putheader('Host', realhost)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000354 for args in self.addheaders: apply(h.putheader, args)
355 h.endheaders()
Andrew M. Kuchling43c5af02000-04-24 14:17:06 +0000356 if data is not None:
357 h.send(data + '\r\n')
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000358 errcode, errmsg, headers = h.getreply()
359 fp = h.getfile()
360 if errcode == 200:
361 return addinfourl(fp, headers, url)
362 else:
Fred Drake567ca8e2000-08-21 21:42:42 +0000363 if data is None:
364 return self.http_error(url, fp, errcode, errmsg, headers)
365 else:
366 return self.http_error(url, fp, errcode, errmsg, headers, data)
367
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000368 def open_gopher(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000369 """Use Gopher protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000370 import gopherlib
371 host, selector = splithost(url)
372 if not host: raise IOError, ('gopher error', 'no host given')
373 host = unquote(host)
374 type, selector = splitgophertype(selector)
375 selector, query = splitquery(selector)
376 selector = unquote(selector)
377 if query:
378 query = unquote(query)
379 fp = gopherlib.send_query(selector, query, host)
380 else:
381 fp = gopherlib.send_selector(selector, host)
382 return addinfourl(fp, noheaders(), "gopher:" + url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000383
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000384 def open_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000385 """Use local file or FTP depending on form of URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000386 if url[:2] == '//' and url[2:3] != '/':
387 return self.open_ftp(url)
388 else:
389 return self.open_local_file(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000390
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000391 def open_local_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000392 """Use local file."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000393 import mimetypes, mimetools, StringIO
394 mtype = mimetypes.guess_type(url)[0]
395 headers = mimetools.Message(StringIO.StringIO(
396 'Content-Type: %s\n' % (mtype or 'text/plain')))
397 host, file = splithost(url)
398 if not host:
Guido van Rossum336a2011999-06-24 15:27:36 +0000399 urlfile = file
400 if file[:1] == '/':
401 urlfile = 'file://' + file
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000402 return addinfourl(open(url2pathname(file), 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000403 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000404 host, port = splitport(host)
405 if not port \
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000406 and socket.gethostbyname(host) in (localhost(), thishost()):
Guido van Rossum336a2011999-06-24 15:27:36 +0000407 urlfile = file
408 if file[:1] == '/':
409 urlfile = 'file://' + file
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000410 return addinfourl(open(url2pathname(file), 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000411 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000412 raise IOError, ('local file error', 'not on local host')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000413
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000414 def open_ftp(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000415 """Use FTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000416 host, path = splithost(url)
417 if not host: raise IOError, ('ftp error', 'no host given')
418 host, port = splitport(host)
419 user, host = splituser(host)
420 if user: user, passwd = splitpasswd(user)
421 else: passwd = None
422 host = unquote(host)
423 user = unquote(user or '')
424 passwd = unquote(passwd or '')
425 host = socket.gethostbyname(host)
426 if not port:
427 import ftplib
428 port = ftplib.FTP_PORT
429 else:
430 port = int(port)
431 path, attrs = splitattr(path)
432 path = unquote(path)
433 dirs = string.splitfields(path, '/')
434 dirs, file = dirs[:-1], dirs[-1]
435 if dirs and not dirs[0]: dirs = dirs[1:]
Guido van Rossum5e006a31999-08-18 17:40:33 +0000436 if dirs and not dirs[0]: dirs[0] = '/'
Jeremy Hylton7ae51bf2000-09-14 16:59:07 +0000437 key = user, host, port, string.join(dirs, '/')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000438 # XXX thread unsafe!
439 if len(self.ftpcache) > MAXFTPCACHE:
440 # Prune the cache, rather arbitrarily
441 for k in self.ftpcache.keys():
442 if k != key:
443 v = self.ftpcache[k]
444 del self.ftpcache[k]
445 v.close()
446 try:
447 if not self.ftpcache.has_key(key):
448 self.ftpcache[key] = \
449 ftpwrapper(user, passwd, host, port, dirs)
450 if not file: type = 'D'
451 else: type = 'I'
452 for attr in attrs:
453 attr, value = splitvalue(attr)
454 if string.lower(attr) == 'type' and \
455 value in ('a', 'A', 'i', 'I', 'd', 'D'):
456 type = string.upper(value)
457 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
458 if retrlen is not None and retrlen >= 0:
459 import mimetools, StringIO
460 headers = mimetools.Message(StringIO.StringIO(
461 'Content-Length: %d\n' % retrlen))
462 else:
463 headers = noheaders()
464 return addinfourl(fp, headers, "ftp:" + url)
465 except ftperrors(), msg:
466 raise IOError, ('ftp error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000467
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000468 def open_data(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000469 """Use "data" URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000470 # ignore POSTed data
471 #
472 # syntax of data URLs:
473 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
474 # mediatype := [ type "/" subtype ] *( ";" parameter )
475 # data := *urlchar
476 # parameter := attribute "=" value
477 import StringIO, mimetools, time
478 try:
479 [type, data] = string.split(url, ',', 1)
480 except ValueError:
481 raise IOError, ('data error', 'bad data URL')
482 if not type:
483 type = 'text/plain;charset=US-ASCII'
484 semi = string.rfind(type, ';')
485 if semi >= 0 and '=' not in type[semi:]:
486 encoding = type[semi+1:]
487 type = type[:semi]
488 else:
489 encoding = ''
490 msg = []
491 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
492 time.gmtime(time.time())))
493 msg.append('Content-type: %s' % type)
494 if encoding == 'base64':
495 import base64
496 data = base64.decodestring(data)
497 else:
498 data = unquote(data)
499 msg.append('Content-length: %d' % len(data))
500 msg.append('')
501 msg.append(data)
502 msg = string.join(msg, '\n')
503 f = StringIO.StringIO(msg)
504 headers = mimetools.Message(f, 0)
505 f.fileno = None # needed for addinfourl
506 return addinfourl(f, headers, url)
Guido van Rossum6d4d1c21998-03-12 14:32:55 +0000507
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000508
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    # Upper bound on chained redirects followed by http_error_302();
    # a false value disables the check.
    maxtries = 10
    # Redirect depth of the request in progress.  Kept as a class
    # default too, so subclasses that skip __init__ still work.
    tries = 0

    def __init__(self, *args, **kwargs):
        """Accept the same positional and keyword arguments as URLopener."""
        # Keyword arguments (key_file=, cert_file=) are forwarded so
        # they reach URLopener.__init__.
        apply(URLopener.__init__, (self,) + args, kwargs)
        self.auth_cache = {}
        self.tries = 0

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        Follows the redirect, but gives up after self.maxtries chained
        redirects so that a server cannot force unbounded recursion.
        When the limit is hit the reply is handed to http_error_500
        (if defined) or http_error_default with code 500.
        """
        self.tries = self.tries + 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        try:
            return self.redirect_internal(url, fp, errcode, errmsg,
                                          headers, data)
        finally:
            # Reset while unwinding so the next top-level request
            # starts counting from zero, even after an exception.
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Open the URL named by the Location (or URI) reply header.

        Returns None when neither header is present.
        """
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin("http:" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.

        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt

        Only the 'basic' scheme is handled; the retry is delegated to
        the method named retry_<self.type>_basic_auth.  Returns None in
        every other case."""
        if headers.has_key('www-authenticate'):
            stuff = headers['www-authenticate']
            import re
            match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
            if match:
                scheme, realm = match.groups()
                if string.lower(scheme) == 'basic':
                    name = 'retry_' + self.type + '_basic_auth'
                    if data is None:
                        return getattr(self,name)(url, realm)
                    else:
                        return getattr(self,name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Re-open 'url' over http with a user:passwd@ prefix on the host.

        Returns None when no user name or password is obtained.
        """
        host, selector = splithost(url)
        i = string.find(host, '@') + 1
        host = host[i:]
        # A non-zero i means credentials were already in the URL and
        # were rejected, so the cached entry is cleared.
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = user + ':' + passwd + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Re-open 'url' over https with a user:passwd@ prefix on the host.

        Returns None when no user name or password is obtained.
        """
        host, selector = splithost(url)
        i = string.find(host, '@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = user + ':' + passwd + '@' + host
        # Go through open() with the full URL, like the http variant,
        # so that POST data is re-sent instead of being dropped.
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return (user, passwd) for realm@host, prompting if not cached.

        With a true 'clear_cache' an existing cache entry is discarded
        and the user is prompted again.
        """
        key = realm + '@' + string.lower(host)
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!

        Returns (user, passwd), or (None, None) on KeyboardInterrupt."""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000605
606
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000607# Utility functions
608
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup is done once; later calls return the cached value.
    """
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000616
_thishost = None
def thishost():
    """Return the IP address of the current host.

    The lookup is done once; later calls return the cached value.
    """
    global _thishost
    if _thishost:
        return _thishost
    hostname = socket.gethostname()
    _thishost = socket.gethostbyname(hostname)
    return _thishost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000624
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    This is ftplib.all_errors; ftplib is imported on first use only.
    """
    global _ftperrors
    if _ftperrors:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000633
_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object."""
    global _noheaders
    if _noheaders:
        return _noheaders
    import mimetools
    import StringIO
    empty_source = StringIO.StringIO()
    _noheaders = mimetools.Message(empty_source, 0)
    _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000644
645
646# Utility classes
647
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000648class ftpwrapper:
Guido van Rossume7b146f2000-02-04 15:28:42 +0000649 """Class used by open_ftp() for cache of open FTP connections."""
650
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000651 def __init__(self, user, passwd, host, port, dirs):
652 self.user = user
653 self.passwd = passwd
654 self.host = host
655 self.port = port
656 self.dirs = dirs
657 self.init()
Guido van Rossume7b146f2000-02-04 15:28:42 +0000658
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000659 def init(self):
660 import ftplib
661 self.busy = 0
662 self.ftp = ftplib.FTP()
663 self.ftp.connect(self.host, self.port)
664 self.ftp.login(self.user, self.passwd)
665 for dir in self.dirs:
666 self.ftp.cwd(dir)
Guido van Rossume7b146f2000-02-04 15:28:42 +0000667
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000668 def retrfile(self, file, type):
669 import ftplib
670 self.endtransfer()
671 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
672 else: cmd = 'TYPE ' + type; isdir = 0
673 try:
674 self.ftp.voidcmd(cmd)
675 except ftplib.all_errors:
676 self.init()
677 self.ftp.voidcmd(cmd)
678 conn = None
679 if file and not isdir:
680 # Use nlst to see if the file exists at all
681 try:
682 self.ftp.nlst(file)
683 except ftplib.error_perm, reason:
684 raise IOError, ('ftp error', reason), sys.exc_info()[2]
685 # Restore the transfer mode!
686 self.ftp.voidcmd(cmd)
687 # Try to retrieve as a file
688 try:
689 cmd = 'RETR ' + file
690 conn = self.ftp.ntransfercmd(cmd)
691 except ftplib.error_perm, reason:
692 if reason[:3] != '550':
693 raise IOError, ('ftp error', reason), sys.exc_info()[2]
694 if not conn:
695 # Set transfer mode to ASCII!
696 self.ftp.voidcmd('TYPE A')
697 # Try a directory listing
698 if file: cmd = 'LIST ' + file
699 else: cmd = 'LIST'
700 conn = self.ftp.ntransfercmd(cmd)
701 self.busy = 1
702 # Pass back both a suitably decorated object and a retrieval length
703 return (addclosehook(conn[0].makefile('rb'),
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000704 self.endtransfer), conn[1])
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000705 def endtransfer(self):
706 if not self.busy:
707 return
708 self.busy = 0
709 try:
710 self.ftp.voidresp()
711 except ftperrors():
712 pass
Guido van Rossume7b146f2000-02-04 15:28:42 +0000713
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000714 def close(self):
715 self.endtransfer()
716 try:
717 self.ftp.close()
718 except ftperrors():
719 pass
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000720
class addbase:
    """Base class for addinfo and addclosehook.

    Wraps a file-like object and re-exports its reading methods as
    attributes of the wrapper itself.
    """

    def __init__(self, fp):
        """Wrap fp; read and readline are required, the rest optional."""
        self.fp = fp
        self.read = fp.read
        self.readline = fp.readline
        # These two are only forwarded when the wrapped object has them.
        for optional in ("readlines", "fileno"):
            if hasattr(fp, optional):
                setattr(self, optional, getattr(fp, optional))

    def __repr__(self):
        details = (self.__class__.__name__, repr(id(self)), repr(self.fp))
        return '<%s at %s whose fp = %s>' % details

    def close(self):
        """Drop the forwarded methods and close the wrapped object."""
        for forwarded in ("read", "readline", "readlines", "fileno"):
            setattr(self, forwarded, None)
        if self.fp:
            self.fp.close()
        self.fp = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000742
class addclosehook(addbase):
    """Class to add a close hook to an open file.

    The hook is called with the stored extra arguments after the
    wrapped file has been closed, and is then forgotten.
    """

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook, self.hookargs = closehook, hookargs

    def close(self):
        """Close the wrapped file, then run the hook if one is set."""
        addbase.close(self)
        if not self.closehook:
            return
        self.closehook(*self.hookargs)
        # Clear both so that a second close() does not run the hook again.
        self.closehook = None
        self.hookargs = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000757
class addinfo(addbase):
    """Class to add an info() method to an open file.

    The headers object passed to the constructor is stored as-is and
    returned unchanged by info().
    """

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        """Return the headers object given at construction time."""
        return self.headers
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000767
class addinfourl(addbase):
    """Class to add info() and geturl() methods to an open file.

    Both the headers object and the URL passed to the constructor
    are stored as-is and returned unchanged by the accessors.
    """

    def __init__(self, fp, headers, url):
        addbase.__init__(self, fp)
        self.headers, self.url = headers, url

    def info(self):
        """Return the headers object given at construction time."""
        return self.headers

    def geturl(self):
        """Return the URL given at construction time."""
        return self.url
Guido van Rossume6ad8911996-09-10 17:02:56 +0000781
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000782
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL.

    A url that carries its own type is returned unchanged.  Otherwise
    the type is taken from base, and so is the host unless url names
    one.  A url path that does not start with '/' is resolved against
    the directory part of the base path, with leading '../'
    components climbing up from there.  The tag and the query of base
    are split off and not used in the result.
    """
    type, path = splittype(url)
    if type:
        # if url is complete (i.e., it contains a type), return it
        return url
    host, path = splithost(path)
    type, basepath = splittype(base) # inherit type from base
    if host:
        # if url contains host, just inherit type
        if type: return type + '://' + host + path
        else:
            # no type inherited, so url must have started with //
            # just return it
            return url
    host, basepath = splithost(basepath) # inherit host
    basepath, basetag = splittag(basepath) # remove extraneous cruft
    basepath, basequery = splitquery(basepath) # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to basepath
            i = len(basepath)
        else:
            # else replace last component
            i = string.rfind(basepath, '/')
        if i < 0:
            # basepath not absolute
            if host:
                # host present, make absolute
                basepath = '/'
            else:
                # else keep non-absolute
                basepath = ''
        else:
            # remove last file component (a no-op when i == len(basepath))
            basepath = basepath[:i+1]
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            # Find the slash in front of the last directory component;
            # the trailing slash itself is excluded from the search.
            i = string.rfind(basepath[:-1], '/')
            if i > 0:
                basepath = basepath[:i+1]
            elif i == 0:
                # Reached the root: stop climbing, further '../' stay in path
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    if type and host: return type + '://' + host + path
    elif type: return type + ':' + path
    elif host: return '//' + host + path # don't know what this means
    else: return path
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000837
838
Guido van Rossum7c395db1994-07-04 22:14:49 +0000839# Utilities to parse URLs (most of these return None for missing parts):
Sjoerd Mullendere0371b81995-11-10 10:36:07 +0000840# unwrap('<URL:type://host/path>') --> 'type://host/path'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000841# splittype('type:opaquestring') --> 'type', 'opaquestring'
842# splithost('//host[:port]/path') --> 'host[:port]', '/path'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000843# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
844# splitpasswd('user:passwd') -> 'user', 'passwd'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000845# splitport('host:port') --> 'host', 'port'
846# splitquery('/path?query') --> '/path', 'query'
847# splittag('/path#tag') --> '/path', 'tag'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000848# splitattr('/path;attr1=value1;attr2=value2;...') ->
849# '/path', ['attr1=value1', 'attr2=value2', ...]
850# splitvalue('attr=value') --> 'attr', 'value'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000851# splitgophertype('/Xselector') --> 'X', 'selector'
852# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
854
def toBytes(url):
    """toBytes(u"URL") --> 'URL'.

    Anything that is not a Unicode object is handed back untouched.
    A Unicode object is encoded as ASCII; UnicodeError is raised,
    with a message quoting the URL, when that is not possible.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if type(url) is not types.UnicodeType:
        return url
    try:
        return url.encode("ASCII")
    except UnicodeError:
        raise UnicodeError("URL "+repr(url)+" contains non-ASCII characters")
865
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Surrounding whitespace, one pair of enclosing angle brackets and
    a leading 'URL:' marker are removed, in that order; each of them
    is optional.
    """
    url = url.strip()
    if url.startswith('<') and url.endswith('>'):
        url = url[1:-1].strip()
    if url.startswith('URL:'):
        url = url[4:].strip()
    return url
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000873
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  When url does not start
    with a type, the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if not found:
        return None, url
    # The match ends just behind the colon.
    return found.group(1).lower(), url[found.end():]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000887
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    When url does not start with '//', the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1), found.group(2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000899
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split is made at the first '@'.  Both parts are unquoted and
    come back as a two-item list; without an '@' the result is the
    tuple (None, host), with host left as it was.
    """
    global _userprog
    if _userprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    found = _userprog.match(host)
    if not found:
        return None, host
    return [unquote(part) for part in found.group(1, 2)]
Guido van Rossum7c395db1994-07-04 22:14:49 +0000911
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split is made at the first ':'.  Without a ':' the result
    is (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1), found.group(2)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000923
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  When host does not
    end in ':' followed by digits, the result is (host, None).
    """
    global _portprog
    if _portprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1), found.group(2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000936
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if not found:
        return host, defport
    # The pattern is greedy, so the split happens at the last ':'.
    host, port = found.group(1, 2)
    nport = None
    if port:
        try:
            nport = int(port)
        except ValueError:
            nport = None
    return host, nport
Guido van Rossum53725a21996-06-13 19:12:35 +0000958
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split is made at the last '?'.  Without a '?' the result
    is (url, None).
    """
    global _queryprog
    if _queryprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    found = _queryprog.match(url)
    if found is None:
        return url, None
    return found.group(1), found.group(2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000970
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split is made at the last '#'.  Without a '#' the result
    is (url, None).
    """
    global _tagprog
    if _tagprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1), found.group(2)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000982
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'."""
    pieces = url.split(';')
    path = pieces[0]
    attrs = pieces[1:]
    return path, attrs
Guido van Rossum7c395db1994-07-04 22:14:49 +0000988
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split is made at the first '='.  Without an '=' the result
    is (attr, None).
    """
    global _valueprog
    if _valueprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1), found.group(2)
Guido van Rossum7c395db1994-07-04 22:14:49 +00001000
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    The type is the single character following a leading '/'.  When
    selector has no leading '/' or nothing after it, the result is
    (None, selector).
    """
    if selector[:1] != '/' or not selector[1:2]:
        return None, selector
    return selector[1], selector[2:]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001006
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Every '%' that is followed by exactly two hexadecimal digits is
    replaced by the character with that code.  Any other '%' is left
    in place together with the text that follows it, so malformed
    input is passed through rather than rejected.
    """
    hexdigits = '0123456789abcdefABCDEF'
    parts = s.split('%')
    res = [parts[0]]
    for item in parts[1:]:
        code = item[:2]
        decoded = None
        # Check the digits explicitly: the integer conversion alone
        # would also accept a sign or a blank, as in '%+1' or '% 1'.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            try:
                decoded = chr(int(code, 16)) + item[2:]
            except ValueError:
                # Presumably only reachable when the decoded character
                # cannot be combined with the rest of the item (e.g. a
                # Unicode tail); keep the escape literally in that case.
                decoded = None
        if decoded is None:
            decoded = '%' + item
        res.append(decoded)
    return ''.join(res)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001025
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'

    Every '+' is turned into a blank first; the result is then
    passed through unquote()."""
    return unquote(s.replace('+', ' '))
Guido van Rossum0564e121996-12-13 14:47:36 +00001032
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')

# The safe set that quote() ends up with when called with its default
# argument; _fast_quote() serves exactly that case.
_fast_safe_test = always_safe + '/'
_fast_safe = None

def _fast_quote(s):
    """Quote s, leaving only the characters of _fast_safe_test alone.

    The lookup table is built on the first call and reused later.
    """
    global _fast_safe
    if _fast_safe is None:
        _fast_safe = {}
        for ch in _fast_safe_test:
            _fast_safe[ch] = ch
    pieces = []
    for ch in s:
        # Safe characters map to themselves; anything else is escaped.
        out = _fast_safe.get(ch)
        if out is None:
            out = '%%%02x' % ord(ch)
        pieces.append(out)
    return ''.join(pieces)

def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    Letters, digits and '_.-' are never quoted; safe names further
    characters to leave alone.  Everything else is replaced by '%'
    and its two-digit lower-case hexadecimal code.
    """
    safe = always_safe + safe
    if safe == _fast_safe_test:
        # Default safe set: use the table-driven variant.
        return _fast_quote(s)
    pieces = []
    for ch in s:
        if ch in safe:
            pieces.append(ch)
        else:
            pieces.append('%%%02x' % ord(ch))
    return ''.join(pieces)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001083
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'

    Every blank-separated piece of s is passed through quote() with
    the given safe characters, and the pieces are joined with '+'."""
    if ' ' not in s:
        return quote(s, safe)
    return '+'.join([quote(word, safe) for word in s.split(' ')])
Guido van Rossum0564e121996-12-13 14:47:36 +00001093
def urlencode(dict):
    """Encode a dictionary of form entries into a URL query string.

    Keys and values are converted with str() and quoted with
    quote_plus(); the 'key=value' pairs are joined with '&' in the
    order in which the dictionary yields its items."""
    pairs = []
    for key, value in dict.items():
        quoted_key = quote_plus(str(key))
        quoted_value = quote_plus(str(value))
        pairs.append(quoted_key + '=' + quoted_value)
    return '&'.join(pairs)
Guido van Rossum810a3391998-07-22 21:33:23 +00001102
Guido van Rossum442e7201996-03-20 15:33:11 +00001103# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    Variable names are compared in lower case, and variables with
    an empty value are ignored.

    """
    suffix = '_proxy'
    proxies = {}
    for name, value in os.environ.items():
        if not value:
            continue
        name = name.lower()
        if name.endswith(suffix):
            scheme = name[:-len(suffix)]
            proxies[scheme] = value
    return proxies
1119
if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        An empty dictionary is returned when the ic module is
        missing or cannot be initialized.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        The ProxyEnable and ProxyServer values of the current user's
        Internet Settings key are read.  A ProxyServer value with
        '=' in it is taken as a ';'-separated list of
        protocol=address entries; otherwise the one value is used
        for both http and ftp.  Registry errors and malformed values
        are swallowed; whatever was collected up to that point is
        returned.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            try:
                proxyEnable = _winreg.QueryValueEx(internetSettings,
                                                   'ProxyEnable')[0]
                if proxyEnable:
                    # Returned as Unicode but problems if not converted to ASCII
                    proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                           'ProxyServer')[0])
                    if '=' in proxyServer:
                        # Per-protocol settings
                        for p in proxyServer.split(';'):
                            protocol, address = p.split('=', 1)
                            proxies[protocol] = '%s://%s' % (protocol, address)
                    else:
                        # Use one setting for all protocols
                        if proxyServer[:5] == 'http:':
                            proxies['http'] = proxyServer
                        else:
                            proxies['http'] = 'http://%s' % proxyServer
                            proxies['ftp'] = 'ftp://%s' % proxyServer
            finally:
                # Release the key handle even when a query or the
                # parsing above raises; otherwise it would stay open.
                internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # Nothing more to do: return what has been collected so far
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()
else:
    # By default use environment variables
    getproxies = getproxies_environment
1204
Guido van Rossum442e7201996-03-20 15:33:11 +00001205
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001206# Test and time quote() and unquote()
1207def test1():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001208 import time
1209 s = ''
1210 for i in range(256): s = s + chr(i)
1211 s = s*4
1212 t0 = time.time()
1213 qs = quote(s)
1214 uqs = unquote(qs)
1215 t1 = time.time()
1216 if uqs != s:
1217 print 'Wrong!'
1218 print `s`
1219 print `qs`
1220 print `uqs`
1221 print round(t1 - t0, 3), 'sec'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001222
1223
Guido van Rossum9ab96d41998-09-28 14:07:00 +00001224def reporthook(blocknum, blocksize, totalsize):
1225 # Report during remote transfers
1226 print "Block number: %d, Block size: %d, Total size: %d" % (blocknum, blocksize, totalsize)
1227
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001228# Test program
def test(args=[]):
    """Retrieve each URL in args and print its headers and contents.

    When args is empty a built-in list of sample URLs is used, plus
    an https URL when URLopener has an open_https method.
    urlcleanup() is always called at the end, also after an error.
    """
    if not args:
        # A fresh list is bound here, so the append below never
        # touches the shared default argument.
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.python.org/etc/passwd',
##          'gopher://gopher.micro.umn.edu/1/',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print '-'*10, url, '-'*10
            fn, h = urlretrieve(url, None, reporthook)
            print fn, h
            if h:
                print '======'
                for k in h.keys(): print k + ':', h[k]
                print '======'
            fp = open(fn, 'rb')
            data = fp.read()
            # Dropping the only reference is what closes the file here.
            del fp
            if '\r' in data:
                # Identity translation table; only the deletion of
                # carriage returns has an effect.
                table = string.maketrans("", "")
                data = string.translate(data, table, "\r")
            print data
            fn, h = None, None
        print '-'*40
    finally:
        urlcleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001261
def main():
    """Command line entry point.

    -h prints a usage message.  -t runs test() on the given URLs;
    given twice, test1() is run first.  Without options the contents
    of each URL argument are printed.
    """
    import getopt, sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print msg
        print "Use -h for help"
        return
    # Number of -t options seen.
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print "Usage: python urllib.py [-t] [url ...]"
            print "-t runs self-test;",
            print "otherwise, contents of urls are printed"
            return
    if t:
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print "Use -h for help"
        for url in args:
            # Trailing comma: no newline is added after the contents.
            print urlopen(url).read(),

# Run test program when run as a script
if __name__ == '__main__':
    main()