blob: ff557761ddd77ac6bb826832f1ac10acf4f0d7c8 [file] [log] [blame]
"""Open an arbitrary URL.

See the following document for more info on URLs:
"Names and Addresses, URIs, URLs, URNs, URCs", at
http://www.w3.org/pub/WWW/Addressing/Overview.html

See also the HTTP spec (from which the error codes are derived):
"HTTP - Hypertext Transfer Protocol", at
http://www.w3.org/pub/WWW/Protocols/

Related standards and specs:
- RFC1808 - the "relative URL" spec. (authoritative status)
- RFC1738 - the "URL standard". (authoritative status)
- RFC1630 - the "URI spec". (informational status)

The object returned by URLopener().open(file) will differ per
protocol. All you know is that it has methods read(), readline(),
readlines(), fileno(), close() and info(). The read*(), fileno()
and close() methods work like those of open files.
The info() method returns a mimetools.Message object which can be
used to query various info about the object, if available.
(mimetools.Message objects are queried with the getheader() method.)
"""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000024
Guido van Rossum7c395db1994-07-04 22:14:49 +000025import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000026import socket
Jack Jansendc3e3f61995-12-15 13:22:13 +000027import os
Guido van Rossum3c8484e1996-11-20 22:02:24 +000028import sys
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000029
30
# Module version; URLopener.version embeds it in the default User-agent
# header value.
__version__ = '1.13'     # XXX This version is not always updated :-(

# URLopener.open_ftp() prunes its connection cache once it holds more
# entries than this.
MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
Guido van Rossum6cb15a01995-06-22 19:00:13 +000034
# Helper for non-unix systems: 'mac' and 'nt' get their conversions from
# platform modules; every other platform uses plain (un)quoting.
if os.name not in ('mac', 'nt'):
    def url2pathname(pathname):
        """Return the local pathname for the path component of a URL."""
        return unquote(pathname)
    def pathname2url(pathname):
        """Return the URL path component for a local pathname."""
        return quote(pathname)
elif os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
else:
    from nturl2path import url2pathname, pathname2url
Guido van Rossum33add0a1998-12-18 15:25:22 +000045
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000046# This really consists of two pieces:
47# (1) a class which handles opening of all sorts of URLs
48# (plus assorted utilities etc.)
49# (2) a set of functions for parsing URLs
50# XXX Should these be separated out into different modules?
51
52
# Shortcut for basic usage: one module-wide opener, created lazily.
_urlopener = None
def urlopen(url, data=None):
    """urlopen(url [, data]) -> open file-like object

    The request goes through the shared FancyURLopener, which is created
    on first use.  data is only forwarded to open() when it is not None.
    """
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    opener = _urlopener
    if data is not None:
        return opener.open(url, data)
    return opener.open(url)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Call retrieve() on the shared opener and return its result.

    The module-wide FancyURLopener is created on first use; all four
    arguments are passed through unchanged.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    """Run cleanup() on the shared opener, if one has been created."""
    if not _urlopener:
        return
    _urlopener.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000072
73
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000074ftpcache = {}
75class URLopener:
Guido van Rossume7b146f2000-02-04 15:28:42 +000076 """Class to open URLs.
77 This is a class rather than just a subroutine because we may need
78 more than one set of global protocol-specific options.
79 Note -- this is a base class for those who don't want the
80 automatic handling of errors type 302 (relocated) and 401
81 (authorization needed)."""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000082
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000083 __tempfiles = None
Guido van Rossum29e77811996-11-27 19:39:58 +000084
Guido van Rossumba311382000-08-24 16:18:04 +000085 version = "Python-urllib/%s" % __version__
86
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000087 # Constructor
Guido van Rossum09c8b6c1999-12-07 21:37:17 +000088 def __init__(self, proxies=None, **x509):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000089 if proxies is None:
90 proxies = getproxies()
91 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
92 self.proxies = proxies
Guido van Rossum09c8b6c1999-12-07 21:37:17 +000093 self.key_file = x509.get('key_file')
94 self.cert_file = x509.get('cert_file')
Guido van Rossumba311382000-08-24 16:18:04 +000095 self.addheaders = [('User-agent', self.version)]
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000096 self.__tempfiles = []
97 self.__unlink = os.unlink # See cleanup()
98 self.tempcache = None
99 # Undocumented feature: if you assign {} to tempcache,
100 # it is used to cache files retrieved with
101 # self.retrieve(). This is not enabled by default
102 # since it does not work for changing documents (and I
103 # haven't got the logic to check expiration headers
104 # yet).
105 self.ftpcache = ftpcache
106 # Undocumented feature: you can use a different
107 # ftp cache by assigning to the .ftpcache member;
108 # in case you want logically independent URL openers
109 # XXX This is not threadsafe. Bah.
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000110
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000111 def __del__(self):
112 self.close()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000113
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000114 def close(self):
115 self.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000116
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000117 def cleanup(self):
118 # This code sometimes runs when the rest of this module
119 # has already been deleted, so it can't use any globals
120 # or import anything.
121 if self.__tempfiles:
122 for file in self.__tempfiles:
123 try:
124 self.__unlink(file)
125 except:
126 pass
127 del self.__tempfiles[:]
128 if self.tempcache:
129 self.tempcache.clear()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000130
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000131 def addheader(self, *args):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000132 """Add a header to be used by the HTTP interface only
133 e.g. u.addheader('Accept', 'sound/basic')"""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000134 self.addheaders.append(args)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000135
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000136 # External interface
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000137 def open(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000138 """Use URLopener().open(file) instead of open(file, 'r')."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000139 fullurl = unwrap(fullurl)
140 if self.tempcache and self.tempcache.has_key(fullurl):
141 filename, headers = self.tempcache[fullurl]
142 fp = open(filename, 'rb')
143 return addinfourl(fp, headers, fullurl)
144 type, url = splittype(fullurl)
145 if not type: type = 'file'
146 if self.proxies.has_key(type):
147 proxy = self.proxies[type]
148 type, proxy = splittype(proxy)
149 host, selector = splithost(proxy)
150 url = (host, fullurl) # Signal special case to open_*()
151 name = 'open_' + type
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000152 self.type = type
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000153 if '-' in name:
154 # replace - with _
155 name = string.join(string.split(name, '-'), '_')
156 if not hasattr(self, name):
157 if data is None:
158 return self.open_unknown(fullurl)
159 else:
160 return self.open_unknown(fullurl, data)
161 try:
162 if data is None:
163 return getattr(self, name)(url)
164 else:
165 return getattr(self, name)(url, data)
166 except socket.error, msg:
167 raise IOError, ('socket error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000168
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000169 def open_unknown(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000170 """Overridable interface to open unknown URL type."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000171 type, url = splittype(fullurl)
172 raise IOError, ('url error', 'unknown url type', type)
Guido van Rossumca445401995-08-29 19:19:12 +0000173
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000174 # External interface
Sjoerd Mullenderd7b86f02000-08-25 11:23:36 +0000175 def retrieve(self, url, filename=None, reporthook=None, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000176 """retrieve(url) returns (filename, None) for a local object
177 or (tempfilename, headers) for a remote object."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000178 url = unwrap(url)
179 if self.tempcache and self.tempcache.has_key(url):
180 return self.tempcache[url]
181 type, url1 = splittype(url)
182 if not filename and (not type or type == 'file'):
183 try:
184 fp = self.open_local_file(url1)
185 hdrs = fp.info()
186 del fp
187 return url2pathname(splithost(url1)[1]), hdrs
188 except IOError, msg:
189 pass
Fred Drake316a7932000-08-24 01:01:26 +0000190 fp = self.open(url, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000191 headers = fp.info()
192 if not filename:
193 import tempfile
194 garbage, path = splittype(url)
195 garbage, path = splithost(path or "")
196 path, garbage = splitquery(path or "")
197 path, garbage = splitattr(path or "")
198 suffix = os.path.splitext(path)[1]
199 filename = tempfile.mktemp(suffix)
200 self.__tempfiles.append(filename)
201 result = filename, headers
202 if self.tempcache is not None:
203 self.tempcache[url] = result
204 tfp = open(filename, 'wb')
205 bs = 1024*8
206 size = -1
207 blocknum = 1
208 if reporthook:
209 if headers.has_key("content-length"):
210 size = int(headers["Content-Length"])
211 reporthook(0, bs, size)
212 block = fp.read(bs)
213 if reporthook:
214 reporthook(1, bs, size)
215 while block:
216 tfp.write(block)
217 block = fp.read(bs)
218 blocknum = blocknum + 1
219 if reporthook:
220 reporthook(blocknum, bs, size)
221 fp.close()
222 tfp.close()
223 del fp
224 del tfp
225 return result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000226
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000227 # Each method named open_<type> knows how to open that type of URL
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000228
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000229 def open_http(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000230 """Use HTTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000231 import httplib
232 user_passwd = None
233 if type(url) is type(""):
234 host, selector = splithost(url)
235 if host:
236 user_passwd, host = splituser(host)
237 host = unquote(host)
238 realhost = host
239 else:
240 host, selector = url
241 urltype, rest = splittype(selector)
242 url = rest
243 user_passwd = None
244 if string.lower(urltype) != 'http':
245 realhost = None
246 else:
247 realhost, rest = splithost(rest)
248 if realhost:
249 user_passwd, realhost = splituser(realhost)
250 if user_passwd:
251 selector = "%s://%s%s" % (urltype, realhost, rest)
252 #print "proxy via http:", host, selector
253 if not host: raise IOError, ('http error', 'no host given')
254 if user_passwd:
255 import base64
256 auth = string.strip(base64.encodestring(user_passwd))
257 else:
258 auth = None
259 h = httplib.HTTP(host)
260 if data is not None:
261 h.putrequest('POST', selector)
262 h.putheader('Content-type', 'application/x-www-form-urlencoded')
263 h.putheader('Content-length', '%d' % len(data))
264 else:
265 h.putrequest('GET', selector)
266 if auth: h.putheader('Authorization', 'Basic %s' % auth)
267 if realhost: h.putheader('Host', realhost)
268 for args in self.addheaders: apply(h.putheader, args)
269 h.endheaders()
270 if data is not None:
271 h.send(data + '\r\n')
272 errcode, errmsg, headers = h.getreply()
273 fp = h.getfile()
274 if errcode == 200:
275 return addinfourl(fp, headers, "http:" + url)
276 else:
277 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000278 return self.http_error(url, fp, errcode, errmsg, headers)
Guido van Rossum29aab751999-03-09 19:31:21 +0000279 else:
280 return self.http_error(url, fp, errcode, errmsg, headers, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000281
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000282 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000283 """Handle http errors.
284 Derived class can override this, or provide specific handlers
285 named http_error_DDD where DDD is the 3-digit error code."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000286 # First check if there's a specific handler for this error
287 name = 'http_error_%d' % errcode
288 if hasattr(self, name):
289 method = getattr(self, name)
290 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000291 result = method(url, fp, errcode, errmsg, headers)
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000292 else:
293 result = method(url, fp, errcode, errmsg, headers, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000294 if result: return result
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000295 return self.http_error_default(url, fp, errcode, errmsg, headers)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000296
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000297 def http_error_default(self, url, fp, errcode, errmsg, headers):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000298 """Default error handler: close the connection and raise IOError."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000299 void = fp.read()
300 fp.close()
301 raise IOError, ('http error', errcode, errmsg, headers)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000302
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000303 if hasattr(socket, "ssl"):
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000304 def open_https(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000305 """Use HTTPS protocol."""
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000306 import httplib
Fred Drake567ca8e2000-08-21 21:42:42 +0000307 user_passwd = None
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000308 if type(url) is type(""):
309 host, selector = splithost(url)
Fred Drake567ca8e2000-08-21 21:42:42 +0000310 if host:
311 user_passwd, host = splituser(host)
312 host = unquote(host)
313 realhost = host
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000314 else:
315 host, selector = url
316 urltype, rest = splittype(selector)
Fred Drake567ca8e2000-08-21 21:42:42 +0000317 url = rest
318 user_passwd = None
319 if string.lower(urltype) != 'https':
320 realhost = None
321 else:
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000322 realhost, rest = splithost(rest)
Fred Drake567ca8e2000-08-21 21:42:42 +0000323 if realhost:
324 user_passwd, realhost = splituser(realhost)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000325 if user_passwd:
326 selector = "%s://%s%s" % (urltype, realhost, rest)
Andrew M. Kuchling7ad47922000-06-10 01:41:48 +0000327 #print "proxy via https:", host, selector
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000328 if not host: raise IOError, ('https error', 'no host given')
329 if user_passwd:
330 import base64
331 auth = string.strip(base64.encodestring(user_passwd))
332 else:
333 auth = None
334 h = httplib.HTTPS(host, 0,
335 key_file=self.key_file,
336 cert_file=self.cert_file)
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000337 if data is not None:
338 h.putrequest('POST', selector)
339 h.putheader('Content-type',
340 'application/x-www-form-urlencoded')
341 h.putheader('Content-length', '%d' % len(data))
342 else:
343 h.putrequest('GET', selector)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000344 if auth: h.putheader('Authorization: Basic %s' % auth)
Fred Drake567ca8e2000-08-21 21:42:42 +0000345 if realhost: h.putheader('Host', realhost)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000346 for args in self.addheaders: apply(h.putheader, args)
347 h.endheaders()
Andrew M. Kuchling43c5af02000-04-24 14:17:06 +0000348 if data is not None:
349 h.send(data + '\r\n')
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000350 errcode, errmsg, headers = h.getreply()
351 fp = h.getfile()
352 if errcode == 200:
353 return addinfourl(fp, headers, url)
354 else:
Fred Drake567ca8e2000-08-21 21:42:42 +0000355 if data is None:
356 return self.http_error(url, fp, errcode, errmsg, headers)
357 else:
358 return self.http_error(url, fp, errcode, errmsg, headers, data)
359
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000360 def open_gopher(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000361 """Use Gopher protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000362 import gopherlib
363 host, selector = splithost(url)
364 if not host: raise IOError, ('gopher error', 'no host given')
365 host = unquote(host)
366 type, selector = splitgophertype(selector)
367 selector, query = splitquery(selector)
368 selector = unquote(selector)
369 if query:
370 query = unquote(query)
371 fp = gopherlib.send_query(selector, query, host)
372 else:
373 fp = gopherlib.send_selector(selector, host)
374 return addinfourl(fp, noheaders(), "gopher:" + url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000375
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000376 def open_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000377 """Use local file or FTP depending on form of URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000378 if url[:2] == '//' and url[2:3] != '/':
379 return self.open_ftp(url)
380 else:
381 return self.open_local_file(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000382
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000383 def open_local_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000384 """Use local file."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000385 import mimetypes, mimetools, StringIO
386 mtype = mimetypes.guess_type(url)[0]
387 headers = mimetools.Message(StringIO.StringIO(
388 'Content-Type: %s\n' % (mtype or 'text/plain')))
389 host, file = splithost(url)
390 if not host:
Guido van Rossum336a2011999-06-24 15:27:36 +0000391 urlfile = file
392 if file[:1] == '/':
393 urlfile = 'file://' + file
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000394 return addinfourl(open(url2pathname(file), 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000395 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000396 host, port = splitport(host)
397 if not port \
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000398 and socket.gethostbyname(host) in (localhost(), thishost()):
Guido van Rossum336a2011999-06-24 15:27:36 +0000399 urlfile = file
400 if file[:1] == '/':
401 urlfile = 'file://' + file
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000402 return addinfourl(open(url2pathname(file), 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000403 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000404 raise IOError, ('local file error', 'not on local host')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000405
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000406 def open_ftp(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000407 """Use FTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000408 host, path = splithost(url)
409 if not host: raise IOError, ('ftp error', 'no host given')
410 host, port = splitport(host)
411 user, host = splituser(host)
412 if user: user, passwd = splitpasswd(user)
413 else: passwd = None
414 host = unquote(host)
415 user = unquote(user or '')
416 passwd = unquote(passwd or '')
417 host = socket.gethostbyname(host)
418 if not port:
419 import ftplib
420 port = ftplib.FTP_PORT
421 else:
422 port = int(port)
423 path, attrs = splitattr(path)
424 path = unquote(path)
425 dirs = string.splitfields(path, '/')
426 dirs, file = dirs[:-1], dirs[-1]
427 if dirs and not dirs[0]: dirs = dirs[1:]
Guido van Rossum5e006a31999-08-18 17:40:33 +0000428 if dirs and not dirs[0]: dirs[0] = '/'
Jeremy Hylton7ae51bf2000-09-14 16:59:07 +0000429 key = user, host, port, string.join(dirs, '/')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000430 # XXX thread unsafe!
431 if len(self.ftpcache) > MAXFTPCACHE:
432 # Prune the cache, rather arbitrarily
433 for k in self.ftpcache.keys():
434 if k != key:
435 v = self.ftpcache[k]
436 del self.ftpcache[k]
437 v.close()
438 try:
439 if not self.ftpcache.has_key(key):
440 self.ftpcache[key] = \
441 ftpwrapper(user, passwd, host, port, dirs)
442 if not file: type = 'D'
443 else: type = 'I'
444 for attr in attrs:
445 attr, value = splitvalue(attr)
446 if string.lower(attr) == 'type' and \
447 value in ('a', 'A', 'i', 'I', 'd', 'D'):
448 type = string.upper(value)
449 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
450 if retrlen is not None and retrlen >= 0:
451 import mimetools, StringIO
452 headers = mimetools.Message(StringIO.StringIO(
453 'Content-Length: %d\n' % retrlen))
454 else:
455 headers = noheaders()
456 return addinfourl(fp, headers, "ftp:" + url)
457 except ftperrors(), msg:
458 raise IOError, ('ftp error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000459
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000460 def open_data(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000461 """Use "data" URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000462 # ignore POSTed data
463 #
464 # syntax of data URLs:
465 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
466 # mediatype := [ type "/" subtype ] *( ";" parameter )
467 # data := *urlchar
468 # parameter := attribute "=" value
469 import StringIO, mimetools, time
470 try:
471 [type, data] = string.split(url, ',', 1)
472 except ValueError:
473 raise IOError, ('data error', 'bad data URL')
474 if not type:
475 type = 'text/plain;charset=US-ASCII'
476 semi = string.rfind(type, ';')
477 if semi >= 0 and '=' not in type[semi:]:
478 encoding = type[semi+1:]
479 type = type[:semi]
480 else:
481 encoding = ''
482 msg = []
483 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
484 time.gmtime(time.time())))
485 msg.append('Content-type: %s' % type)
486 if encoding == 'base64':
487 import base64
488 data = base64.decodestring(data)
489 else:
490 data = unquote(data)
491 msg.append('Content-length: %d' % len(data))
492 msg.append('')
493 msg.append(data)
494 msg = string.join(msg, '\n')
495 f = StringIO.StringIO(msg)
496 headers = mimetools.Message(f, 0)
497 f.fileno = None # needed for addinfourl
498 return addinfourl(f, headers, url)
Guido van Rossum6d4d1c21998-03-12 14:32:55 +0000499
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000500
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps).

    Adds redirect following (301/302) and HTTP basic authentication
    (401) on top of URLopener, and returns the reply instead of raising
    for every other HTTP error.
    """

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as URLopener.__init__.

        Keyword arguments (e.g. key_file, cert_file) are forwarded too,
        so HTTPS credentials can be given to the fancy opener as well.
        """
        apply(URLopener.__init__, (self,) + args, kwargs)
        # Maps 'realm@host' to (user, passwd); see get_user_passwd().
        self.auth_cache = {}

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        Follows the 'location' (or 'uri') header; returns None when the
        reply carries neither, which makes http_error() fall back to
        http_error_default().
        """
        # XXX The server can force infinite recursion here!
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        fp.read()   # drain the reply body before closing
        fp.close()
        # In case the server sent a relative URL, join with original.
        # Use the scheme recorded by open() so that relative redirects of
        # https requests stay on https; fall back to http when an
        # open_*() method was called directly and no scheme was recorded.
        scheme = getattr(self, 'type', None) or 'http'
        newurl = basejoin(scheme + ":" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.

        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt

        Only the 'basic' scheme is handled, by calling the
        retry_<scheme>_basic_auth() method matching the URL scheme that
        open() recorded.  Returns None for anything else."""
        if headers.has_key('www-authenticate'):
            stuff = headers['www-authenticate']
            import re
            match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
            if match:
                scheme, realm = match.groups()
                if string.lower(scheme) == 'basic':
                    name = 'retry_' + self.type + '_basic_auth'
                    if data is None:
                        return getattr(self,name)(url, realm)
                    else:
                        return getattr(self,name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Re-issue an http request with credentials for realm.

        Returns None when no user name or password could be obtained.
        """
        host, selector = splithost(url)
        # i is non-zero when the URL already carried credentials; since
        # those were just rejected, it also tells get_user_passwd() to
        # discard the cached entry.
        i = string.find(host, '@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = user + ':' + passwd + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Re-issue an https request with credentials for realm.

        Behaves like retry_http_basic_auth(): the retry goes through
        open() and any POST data is sent again.
        """
        host, selector = splithost(url)
        i = string.find(host, '@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = user + ':' + passwd + '@' + host
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return (user, passwd) for realm at host.

        Cached credentials are reused unless clear_cache is true, in
        which case the cached entry is dropped and the user is prompted
        again.  Non-empty answers are stored in the cache.
        """
        key = realm + '@' + string.lower(host)
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!

        Asks on the terminal; returns (None, None) when interrupted."""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000597
598
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000599# Utility functions
600
# Cached result of localhost(); filled in on the first call.
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000608
# Cached result of thishost(); filled in on the first call.
_thishost = None
def thishost():
    """Return the IP address of the current host."""
    global _thishost
    if _thishost:
        return _thishost
    hostname = socket.gethostname()
    _thishost = socket.gethostbyname(hostname)
    return _thishost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000616
# Cached result of ftperrors(); ftplib is only imported on first use.
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if _ftperrors:
        return _ftperrors
    from ftplib import all_errors
    _ftperrors = all_errors
    return _ftperrors
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000625
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000626_noheaders = None
627def noheaders():
Guido van Rossume7b146f2000-02-04 15:28:42 +0000628 """Return an empty mimetools.Message object."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000629 global _noheaders
630 if not _noheaders:
631 import mimetools
632 import StringIO
633 _noheaders = mimetools.Message(StringIO.StringIO(), 0)
634 _noheaders.fp.close() # Recycle file descriptor
635 return _noheaders
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000636
637
638# Utility classes
639
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000640class ftpwrapper:
Guido van Rossume7b146f2000-02-04 15:28:42 +0000641 """Class used by open_ftp() for cache of open FTP connections."""
642
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000643 def __init__(self, user, passwd, host, port, dirs):
644 self.user = user
645 self.passwd = passwd
646 self.host = host
647 self.port = port
648 self.dirs = dirs
649 self.init()
Guido van Rossume7b146f2000-02-04 15:28:42 +0000650
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000651 def init(self):
652 import ftplib
653 self.busy = 0
654 self.ftp = ftplib.FTP()
655 self.ftp.connect(self.host, self.port)
656 self.ftp.login(self.user, self.passwd)
657 for dir in self.dirs:
658 self.ftp.cwd(dir)
Guido van Rossume7b146f2000-02-04 15:28:42 +0000659
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000660 def retrfile(self, file, type):
661 import ftplib
662 self.endtransfer()
663 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
664 else: cmd = 'TYPE ' + type; isdir = 0
665 try:
666 self.ftp.voidcmd(cmd)
667 except ftplib.all_errors:
668 self.init()
669 self.ftp.voidcmd(cmd)
670 conn = None
671 if file and not isdir:
672 # Use nlst to see if the file exists at all
673 try:
674 self.ftp.nlst(file)
675 except ftplib.error_perm, reason:
676 raise IOError, ('ftp error', reason), sys.exc_info()[2]
677 # Restore the transfer mode!
678 self.ftp.voidcmd(cmd)
679 # Try to retrieve as a file
680 try:
681 cmd = 'RETR ' + file
682 conn = self.ftp.ntransfercmd(cmd)
683 except ftplib.error_perm, reason:
684 if reason[:3] != '550':
685 raise IOError, ('ftp error', reason), sys.exc_info()[2]
686 if not conn:
687 # Set transfer mode to ASCII!
688 self.ftp.voidcmd('TYPE A')
689 # Try a directory listing
690 if file: cmd = 'LIST ' + file
691 else: cmd = 'LIST'
692 conn = self.ftp.ntransfercmd(cmd)
693 self.busy = 1
694 # Pass back both a suitably decorated object and a retrieval length
695 return (addclosehook(conn[0].makefile('rb'),
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000696 self.endtransfer), conn[1])
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000697 def endtransfer(self):
698 if not self.busy:
699 return
700 self.busy = 0
701 try:
702 self.ftp.voidresp()
703 except ftperrors():
704 pass
Guido van Rossume7b146f2000-02-04 15:28:42 +0000705
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000706 def close(self):
707 self.endtransfer()
708 try:
709 self.ftp.close()
710 except ftperrors():
711 pass
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000712
class addbase:
    """Base class for addinfo and addclosehook.

    Wraps an open file-like object and re-exports its reading methods
    as instance attributes.  close() closes the wrapped object and
    resets all of those attributes (and self.fp) to None.
    """

    def __init__(self, fp):
        self.fp = fp
        self.read = fp.read
        self.readline = fp.readline
        # These two are optional on the wrapped object; copy them only
        # when they are actually available.
        for optional in ("readlines", "fileno"):
            if hasattr(fp, optional):
                setattr(self, optional, getattr(fp, optional))

    def __repr__(self):
        return '<%s at %s whose fp = %s>' % (
            self.__class__.__name__, repr(id(self)), repr(self.fp))

    def close(self):
        self.read = self.readline = None
        self.readlines = self.fileno = None
        wrapped = self.fp
        if wrapped:
            wrapped.close()
        self.fp = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000734
class addclosehook(addbase):
    """Class to add a close hook to an open file.

    closehook(*hookargs) is called once, after the wrapped file has
    been closed; the hook and its arguments are then forgotten so a
    second close() does not call it again.
    """

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        addbase.close(self)
        hook = self.closehook
        if hook:
            hook(*self.hookargs)
            self.closehook = None
            self.hookargs = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000749
class addinfo(addbase):
    """Class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        # `headers` is stored as given and handed back by info().
        self.headers = headers
        addbase.__init__(self, fp)

    def info(self):
        """Return the headers object passed to the constructor."""
        return self.headers
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000759
class addinfourl(addbase):
    """Class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        # Both values are stored as given and returned unchanged.
        self.url = url
        self.headers = headers
        addbase.__init__(self, fp)

    def info(self):
        """Return the headers object passed to the constructor."""
        return self.headers

    def geturl(self):
        """Return the URL passed to the constructor."""
        return self.url
Guido van Rossume6ad8911996-09-10 17:02:56 +0000773
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000774
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL.

    A `url` that already carries a scheme is returned unchanged.
    Otherwise the scheme, then the host, then the directory part of
    the path are inherited from `base` as far as `url` lacks them.
    """
    scheme, path = splittype(url)
    if scheme:
        # url is complete (it contains a type): nothing to inherit
        return url
    host, path = splithost(path)
    scheme, basepath = splittype(base)  # inherit type from base
    if host:
        # url names its own host, so only the type can be inherited;
        # without one the url started with '//' and is returned as is
        if scheme:
            return scheme + '://' + host + path
        return url
    host, basepath = splithost(basepath)  # inherit host
    # drop the base's tag and query; only its path matters from here on
    basepath, _basetag = splittag(basepath)
    basepath, _basequery = splitquery(basepath)
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query: keep the whole basepath
            cut = len(basepath)
        else:
            # otherwise the last component of basepath gets replaced
            cut = basepath.rfind('/')
        if cut >= 0:
            basepath = basepath[:cut+1]
        elif host:
            # basepath not absolute but a host is present: make absolute
            basepath = '/'
        else:
            # basepath not absolute and no host: keep it relative
            basepath = ''
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            cut = basepath[:-1].rfind('/')
            if cut > 0:
                basepath = basepath[:cut+1]
            elif cut == 0:
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    if host:
        # a host without a type yields '//host/path'
        path = '//' + host + path
    if scheme:
        return scheme + ':' + path
    return path
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000829
830
Guido van Rossum7c395db1994-07-04 22:14:49 +0000831# Utilities to parse URLs (most of these return None for missing parts):
Sjoerd Mullendere0371b81995-11-10 10:36:07 +0000832# unwrap('<URL:type://host/path>') --> 'type://host/path'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000833# splittype('type:opaquestring') --> 'type', 'opaquestring'
834# splithost('//host[:port]/path') --> 'host[:port]', '/path'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000835# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
836# splitpasswd('user:passwd') -> 'user', 'passwd'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000837# splitport('host:port') --> 'host', 'port'
838# splitquery('/path?query') --> '/path', 'query'
839# splittag('/path#tag') --> '/path', 'tag'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000840# splitattr('/path;attr1=value1;attr2=value2;...') ->
841# '/path', ['attr1=value1', 'attr2=value2', ...]
842# splitvalue('attr=value') --> 'attr', 'value'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000843# splitgophertype('/Xselector') --> 'X', 'selector'
844# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
846
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Surrounding whitespace, one pair of enclosing angle brackets and a
    leading 'URL:' marker are removed, in that order.
    """
    url = url.strip()
    if url.startswith('<') and url.endswith('>'):
        url = url[1:-1].strip()
    if url.startswith('URL:'):
        url = url[4:].strip()
    return url
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000854
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case; without a leading
    'type:' prefix the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # compiled on first use and cached at module level
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if not found:
        return None, url
    # the match covers the type plus the ':' that terminates it
    return found.group(1).lower(), url[found.end():]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000868
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # compiled on first use and cached at module level
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    found = _hostprog.match(url)
    if not found:
        return None, url
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000880
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the first '@'; both parts are passed through
    unquote() and returned as a two-element list.  Without an '@' the
    result is the tuple (None, host).
    """
    global _userprog
    if _userprog is None:
        # compiled on first use and cached at module level
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    found = _userprog.match(host)
    if not found:
        return None, host
    return [unquote(part) for part in found.groups()]
Guido van Rossum7c395db1994-07-04 22:14:49 +0000892
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; without one the result is
    (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        # compiled on first use and cached at module level
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if not found:
        return user, None
    return found.groups()
Guido van Rossum7c395db1994-07-04 22:14:49 +0000904
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string and is only split off when it
    consists of decimal digits; otherwise the result is (host, None).
    """
    global _portprog
    if _portprog is None:
        # compiled on first use and cached at module level
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if not found:
        return host, None
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000917
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        # compiled on first use and cached at module level
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if not found:
        return host, defport
    host, port = found.groups()
    nport = None
    if port:
        # an empty port stays None, just like a non-numeric one
        try:
            nport = int(port)
        except ValueError:
            pass
    return host, nport
Guido van Rossum53725a21996-06-13 19:12:35 +0000939
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'; without one the result is
    (url, None).
    """
    global _queryprog
    if _queryprog is None:
        # compiled on first use and cached at module level
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    found = _queryprog.match(url)
    if not found:
        return url, None
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000951
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'; without one the result is
    (url, None).
    """
    global _tagprog
    if _tagprog is None:
        # compiled on first use and cached at module level
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if not found:
        return url, None
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000963
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    Without any ';' the attribute list is empty.
    """
    pieces = url.split(';')
    path = pieces[0]
    attrs = pieces[1:]
    return path, attrs
Guido van Rossum7c395db1994-07-04 22:14:49 +0000969
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='; without one the result is
    (attr, None).
    """
    global _valueprog
    if _valueprog is None:
        # compiled on first use and cached at module level
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if not found:
        return attr, None
    return found.groups()
Guido van Rossum7c395db1994-07-04 22:14:49 +0000981
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    The type is the single character following a leading '/'; when
    the selector lacks either, the result is (None, selector).
    """
    if selector[:1] != '/' or not selector[1:2]:
        return None, selector
    gtype = selector[1]
    rest = selector[2:]
    return gtype, rest
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000987
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Every '%' followed by exactly two hexadecimal digits is replaced
    by the character with that code.  A '%' that is not followed by
    two hex digits is left in the result unchanged.
    """
    hexdigits = '0123456789abcdefABCDEF'
    pieces = s.split('%')
    res = [pieces[0]]
    for item in pieces[1:]:
        code = item[:2]
        # Require two real hex digits: a plain base-16 conversion also
        # accepts signs and blanks, which would decode e.g. '%+1'.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            try:
                res.append(chr(int(code, 16)) + item[2:])
                continue
            except ValueError:
                # The decoded character could not be combined with the
                # rest of the item; fall through and keep the escape.
                pass
        res.append('%' + item)
    return ''.join(res)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001006
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'

    Like unquote(), but every '+' is turned into a space first.
    """
    if '+' in s:
        s = s.replace('+', ' ')
    return unquote(s)
Guido van Rossum0564e121996-12-13 14:47:36 +00001013
Fredrik Lundhb49f88b2000-09-24 18:51:25 +00001014always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
Jeremy Hylton6102e292000-08-31 15:48:10 +00001015 'abcdefghijklmnopqrstuvwxyz'
Jeremy Hylton7ae51bf2000-09-14 16:59:07 +00001016 '0123456789' '_.-')
1017
1018_fast_safe_test = always_safe + '/'
1019_fast_safe = None
1020
1021def _fast_quote(s):
1022 global _fast_safe
1023 if _fast_safe is None:
1024 _fast_safe = {}
1025 for c in _fast_safe_test:
1026 _fast_safe[c] = c
1027 res = list(s)
1028 for i in range(len(res)):
1029 c = res[i]
1030 if not _fast_safe.has_key(c):
1031 res[i] = '%%%02x' % ord(c)
1032 return string.join(res, '')
1033
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    Characters in `safe` are left alone in addition to letters,
    digits and '_.-'; everything else becomes '%xx' (lower-case hex).
    """
    safe = always_safe + safe
    if safe == _fast_safe_test:
        # default safe set: use the table-driven implementation
        return _fast_quote(s)
    quoted = []
    for ch in s:
        if ch in safe:
            quoted.append(ch)
        else:
            quoted.append('%%%02x' % ord(ch))
    return ''.join(quoted)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001064
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'

    The pieces between spaces are quoted with quote(piece, safe),
    so a literal '+' in `s` is escaped unless `safe` contains it.
    """
    if ' ' not in s:
        return quote(s, safe)
    return '+'.join([quote(word, safe) for word in s.split(' ')])
Guido van Rossum0564e121996-12-13 14:47:36 +00001074
def urlencode(dict):
    """Encode a dictionary of form entries into a URL query string.

    Keys and values are converted with str(), quoted with
    quote_plus() and joined as 'key=value' pairs separated by '&',
    in the order in which dict.items() yields them.
    """
    pairs = [quote_plus(str(key)) + '=' + quote_plus(str(value))
             for key, value in dict.items()]
    return '&'.join(pairs)
Guido van Rossum810a3391998-07-22 21:33:23 +00001083
Guido van Rossum442e7201996-03-20 15:33:11 +00001084# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    Variable names are compared in lower case and variables with an
    empty value are ignored.

    """
    suffix = '_proxy'
    proxies = {}
    for name, value in os.environ.items():
        if not value:
            continue
        name = name.lower()
        if name.endswith(suffix):
            proxies[name[:-len(suffix)]] = value
    return proxies
1100
# Select the platform's getproxies() implementation.
if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        An empty dictionary is returned when the ic module or the
        Internet Config database is not available.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        The ProxyServer value is either a list of 'protocol=address'
        entries separated by ';' or one address that is used for both
        http and ftp.  Registry errors and malformed values are not
        reported; whatever was collected up to that point is returned.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            # Close the key on every path, including when one of the
            # queries or the parsing below raises.
            try:
                proxyEnable = _winreg.QueryValueEx(internetSettings,
                                                   'ProxyEnable')[0]
                if proxyEnable:
                    # Returned as Unicode but problems if not converted to ASCII
                    proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                           'ProxyServer')[0])
                    if '=' in proxyServer:
                        # Per-protocol settings
                        for p in proxyServer.split(';'):
                            protocol, address = p.split('=', 1)
                            proxies[protocol] = '%s://%s' % (protocol, address)
                    else:
                        # Use one setting for all protocols
                        if proxyServer[:5] == 'http:':
                            proxies['http'] = proxyServer
                        else:
                            proxies['http'] = 'http://%s' % proxyServer
                            proxies['ftp'] = 'ftp://%s' % proxyServer
            finally:
                internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()
else:
    # By default use environment variables
    getproxies = getproxies_environment
1185
Guido van Rossum442e7201996-03-20 15:33:11 +00001186
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001187# Test and time quote() and unquote()
def test1():
    """Round-trip all 256 character codes through quote()/unquote().

    Prints 'Wrong!' if the round trip changed the data, then the
    original, quoted and unquoted strings and the elapsed time.
    """
    import time
    # every character code from 0 to 255, four times over
    s = ''.join([chr(code) for code in range(256)]) * 4
    started = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    finished = time.time()
    if uqs != s:
        print('Wrong!')
    for text in (s, qs, uqs):
        print(repr(text))
    print('%s sec' % (round(finished - started, 3),))
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001203
1204
def reporthook(blocknum, blocksize, totalsize):
    """Progress hook for the test program: print the three counters."""
    # Report during remote transfers
    template = "Block number: %d, Block size: %d, Total size: %d"
    print(template % (blocknum, blocksize, totalsize))
1208
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001209# Test program
Guido van Rossum23490151998-06-25 02:39:00 +00001210def test(args=[]):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001211 if not args:
1212 args = [
1213 '/etc/passwd',
1214 'file:/etc/passwd',
1215 'file://localhost/etc/passwd',
1216 'ftp://ftp.python.org/etc/passwd',
1217## 'gopher://gopher.micro.umn.edu/1/',
1218 'http://www.python.org/index.html',
1219 ]
Guido van Rossum09c8b6c1999-12-07 21:37:17 +00001220 if hasattr(URLopener, "open_https"):
1221 args.append('https://synergy.as.cmu.edu/~geek/')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001222 try:
1223 for url in args:
1224 print '-'*10, url, '-'*10
1225 fn, h = urlretrieve(url, None, reporthook)
1226 print fn, h
1227 if h:
1228 print '======'
1229 for k in h.keys(): print k + ':', h[k]
1230 print '======'
1231 fp = open(fn, 'rb')
1232 data = fp.read()
1233 del fp
1234 if '\r' in data:
1235 table = string.maketrans("", "")
1236 data = string.translate(data, table, "\r")
1237 print data
1238 fn, h = None, None
1239 print '-'*40
1240 finally:
1241 urlcleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001242
Guido van Rossum23490151998-06-25 02:39:00 +00001243def main():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001244 import getopt, sys
1245 try:
1246 opts, args = getopt.getopt(sys.argv[1:], "th")
1247 except getopt.error, msg:
1248 print msg
1249 print "Use -h for help"
1250 return
1251 t = 0
1252 for o, a in opts:
1253 if o == '-t':
1254 t = t + 1
1255 if o == '-h':
1256 print "Usage: python urllib.py [-t] [url ...]"
1257 print "-t runs self-test;",
1258 print "otherwise, contents of urls are printed"
1259 return
1260 if t:
1261 if t > 1:
1262 test1()
1263 test(args)
1264 else:
1265 if not args:
1266 print "Use -h for help"
1267 for url in args:
1268 print urlopen(url).read(),
Guido van Rossum23490151998-06-25 02:39:00 +00001269
# Run test program when run as a script; importing the module
# does not call main().
if __name__ == '__main__':
    main()