blob: d474cfb8ab60819e878c43fd56450480d84a6fc8 [file] [log] [blame]
"""Open an arbitrary URL.

See the following document for more info on URLs:
"Names and Addresses, URIs, URLs, URNs, URCs", at
http://www.w3.org/pub/WWW/Addressing/Overview.html

See also the HTTP spec (from which the error codes are derived):
"HTTP - Hypertext Transfer Protocol", at
http://www.w3.org/pub/WWW/Protocols/

Related standards and specs:
- RFC1808: the "relative URL" spec. (authoritative status)
- RFC1738 - the "URL standard". (authoritative status)
- RFC1630 - the "URI spec". (informational status)

The object returned by URLopener().open(file) will differ per
protocol. All you know is that it has methods read(), readline(),
readlines(), fileno(), close() and info(). The read*(), fileno()
and close() methods work like those of open files.
The info() method returns a mimetools.Message object which can be
used to query various info about the object, if available.
(mimetools.Message objects are queried with the getheader() method.)
"""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000024
Guido van Rossum7c395db1994-07-04 22:14:49 +000025import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000026import socket
Jack Jansendc3e3f61995-12-15 13:22:13 +000027import os
Guido van Rossum3c8484e1996-11-20 22:02:24 +000028import sys
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000029
30
# Module version; URLopener.__init__ embeds it in the default
# 'User-agent' header ("Python-urllib/<version>").
__version__ = '1.12' # XXX This version is not always updated :-(

# URLopener.open_ftp() starts pruning the FTP connection cache once it
# holds more than this many entries.
MAXFTPCACHE = 10 # Trim the ftp cache beyond this size

# Helper for non-unix systems
# On 'mac' and 'nt' the URL <-> pathname converters come from a
# platform-specific module; everywhere else they are thin wrappers
# around unquote()/quote().
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    # NOTE(review): unquote() and quote() are not defined in this part
    # of the file; presumably they are module-level functions defined
    # further down -- they are only looked up when these are called.
    def url2pathname(pathname):
        # Turn the path component of a URL into a local pathname.
        return unquote(pathname)
    def pathname2url(pathname):
        # Turn a local pathname into the path component of a URL.
        return quote(pathname)
Guido van Rossum33add0a1998-12-18 15:25:22 +000045
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000046# This really consists of two pieces:
47# (1) a class which handles opening of all sorts of URLs
48# (plus assorted utilities etc.)
49# (2) a set of functions for parsing URLs
50# XXX Should these be separated out into different modules?
51
52
# Shortcut for basic usage
# Module-wide opener shared by urlopen(), urlretrieve() and urlcleanup();
# it is created lazily on first use.
_urlopener = None
def urlopen(url, data=None):
    """urlopen(url [, data]) -> open file-like object

    Open url through the shared FancyURLopener, creating that opener
    on the first call.  data is forwarded to the opener's open()
    method only when it is not None."""
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    opener = _urlopener
    if data is not None:
        return opener.open(url, data)
    return opener.open(url)
def urlretrieve(url, filename=None, reporthook=None):
    """urlretrieve(url [, filename [, reporthook]]) -> retrieve() result

    Delegate to the retrieve() method of the shared FancyURLopener,
    creating that opener first if it does not exist yet.  All three
    arguments are passed through unchanged."""
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook)
def urlcleanup():
    """Call cleanup() on the shared opener, if one has been created.

    Does nothing when neither urlopen() nor urlretrieve() has
    instantiated the module-wide opener yet."""
    opener = _urlopener
    if not opener:
        return
    opener.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000072
73
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000074ftpcache = {}
75class URLopener:
Guido van Rossume7b146f2000-02-04 15:28:42 +000076 """Class to open URLs.
77 This is a class rather than just a subroutine because we may need
78 more than one set of global protocol-specific options.
79 Note -- this is a base class for those who don't want the
80 automatic handling of errors type 302 (relocated) and 401
81 (authorization needed)."""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000082
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000083 __tempfiles = None
Guido van Rossum29e77811996-11-27 19:39:58 +000084
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000085 # Constructor
Guido van Rossum09c8b6c1999-12-07 21:37:17 +000086 def __init__(self, proxies=None, **x509):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000087 if proxies is None:
88 proxies = getproxies()
89 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
90 self.proxies = proxies
Guido van Rossum09c8b6c1999-12-07 21:37:17 +000091 self.key_file = x509.get('key_file')
92 self.cert_file = x509.get('cert_file')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000093 server_version = "Python-urllib/%s" % __version__
94 self.addheaders = [('User-agent', server_version)]
95 self.__tempfiles = []
96 self.__unlink = os.unlink # See cleanup()
97 self.tempcache = None
98 # Undocumented feature: if you assign {} to tempcache,
99 # it is used to cache files retrieved with
100 # self.retrieve(). This is not enabled by default
101 # since it does not work for changing documents (and I
102 # haven't got the logic to check expiration headers
103 # yet).
104 self.ftpcache = ftpcache
105 # Undocumented feature: you can use a different
106 # ftp cache by assigning to the .ftpcache member;
107 # in case you want logically independent URL openers
108 # XXX This is not threadsafe. Bah.
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000109
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000110 def __del__(self):
111 self.close()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000112
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000113 def close(self):
114 self.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000115
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000116 def cleanup(self):
117 # This code sometimes runs when the rest of this module
118 # has already been deleted, so it can't use any globals
119 # or import anything.
120 if self.__tempfiles:
121 for file in self.__tempfiles:
122 try:
123 self.__unlink(file)
124 except:
125 pass
126 del self.__tempfiles[:]
127 if self.tempcache:
128 self.tempcache.clear()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000129
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000130 def addheader(self, *args):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000131 """Add a header to be used by the HTTP interface only
132 e.g. u.addheader('Accept', 'sound/basic')"""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000133 self.addheaders.append(args)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000134
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000135 # External interface
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000136 def open(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000137 """Use URLopener().open(file) instead of open(file, 'r')."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000138 fullurl = unwrap(fullurl)
139 if self.tempcache and self.tempcache.has_key(fullurl):
140 filename, headers = self.tempcache[fullurl]
141 fp = open(filename, 'rb')
142 return addinfourl(fp, headers, fullurl)
143 type, url = splittype(fullurl)
144 if not type: type = 'file'
145 if self.proxies.has_key(type):
146 proxy = self.proxies[type]
147 type, proxy = splittype(proxy)
148 host, selector = splithost(proxy)
149 url = (host, fullurl) # Signal special case to open_*()
150 name = 'open_' + type
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000151 self.type = type
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000152 if '-' in name:
153 # replace - with _
154 name = string.join(string.split(name, '-'), '_')
155 if not hasattr(self, name):
156 if data is None:
157 return self.open_unknown(fullurl)
158 else:
159 return self.open_unknown(fullurl, data)
160 try:
161 if data is None:
162 return getattr(self, name)(url)
163 else:
164 return getattr(self, name)(url, data)
165 except socket.error, msg:
166 raise IOError, ('socket error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000167
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000168 def open_unknown(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000169 """Overridable interface to open unknown URL type."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000170 type, url = splittype(fullurl)
171 raise IOError, ('url error', 'unknown url type', type)
Guido van Rossumca445401995-08-29 19:19:12 +0000172
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000173 # External interface
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000174 def retrieve(self, url, filename=None, reporthook=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000175 """retrieve(url) returns (filename, None) for a local object
176 or (tempfilename, headers) for a remote object."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000177 url = unwrap(url)
178 if self.tempcache and self.tempcache.has_key(url):
179 return self.tempcache[url]
180 type, url1 = splittype(url)
181 if not filename and (not type or type == 'file'):
182 try:
183 fp = self.open_local_file(url1)
184 hdrs = fp.info()
185 del fp
186 return url2pathname(splithost(url1)[1]), hdrs
187 except IOError, msg:
188 pass
189 fp = self.open(url)
190 headers = fp.info()
191 if not filename:
192 import tempfile
193 garbage, path = splittype(url)
194 garbage, path = splithost(path or "")
195 path, garbage = splitquery(path or "")
196 path, garbage = splitattr(path or "")
197 suffix = os.path.splitext(path)[1]
198 filename = tempfile.mktemp(suffix)
199 self.__tempfiles.append(filename)
200 result = filename, headers
201 if self.tempcache is not None:
202 self.tempcache[url] = result
203 tfp = open(filename, 'wb')
204 bs = 1024*8
205 size = -1
206 blocknum = 1
207 if reporthook:
208 if headers.has_key("content-length"):
209 size = int(headers["Content-Length"])
210 reporthook(0, bs, size)
211 block = fp.read(bs)
212 if reporthook:
213 reporthook(1, bs, size)
214 while block:
215 tfp.write(block)
216 block = fp.read(bs)
217 blocknum = blocknum + 1
218 if reporthook:
219 reporthook(blocknum, bs, size)
220 fp.close()
221 tfp.close()
222 del fp
223 del tfp
224 return result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000225
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000226 # Each method named open_<type> knows how to open that type of URL
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000227
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000228 def open_http(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000229 """Use HTTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000230 import httplib
231 user_passwd = None
232 if type(url) is type(""):
233 host, selector = splithost(url)
234 if host:
235 user_passwd, host = splituser(host)
236 host = unquote(host)
237 realhost = host
238 else:
239 host, selector = url
240 urltype, rest = splittype(selector)
241 url = rest
242 user_passwd = None
243 if string.lower(urltype) != 'http':
244 realhost = None
245 else:
246 realhost, rest = splithost(rest)
247 if realhost:
248 user_passwd, realhost = splituser(realhost)
249 if user_passwd:
250 selector = "%s://%s%s" % (urltype, realhost, rest)
251 #print "proxy via http:", host, selector
252 if not host: raise IOError, ('http error', 'no host given')
253 if user_passwd:
254 import base64
255 auth = string.strip(base64.encodestring(user_passwd))
256 else:
257 auth = None
258 h = httplib.HTTP(host)
259 if data is not None:
260 h.putrequest('POST', selector)
261 h.putheader('Content-type', 'application/x-www-form-urlencoded')
262 h.putheader('Content-length', '%d' % len(data))
263 else:
264 h.putrequest('GET', selector)
265 if auth: h.putheader('Authorization', 'Basic %s' % auth)
266 if realhost: h.putheader('Host', realhost)
267 for args in self.addheaders: apply(h.putheader, args)
268 h.endheaders()
269 if data is not None:
270 h.send(data + '\r\n')
271 errcode, errmsg, headers = h.getreply()
272 fp = h.getfile()
273 if errcode == 200:
274 return addinfourl(fp, headers, "http:" + url)
275 else:
276 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000277 return self.http_error(url, fp, errcode, errmsg, headers)
Guido van Rossum29aab751999-03-09 19:31:21 +0000278 else:
279 return self.http_error(url, fp, errcode, errmsg, headers, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000280
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000281 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000282 """Handle http errors.
283 Derived class can override this, or provide specific handlers
284 named http_error_DDD where DDD is the 3-digit error code."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000285 # First check if there's a specific handler for this error
286 name = 'http_error_%d' % errcode
287 if hasattr(self, name):
288 method = getattr(self, name)
289 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000290 result = method(url, fp, errcode, errmsg, headers)
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000291 else:
292 result = method(url, fp, errcode, errmsg, headers, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000293 if result: return result
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000294 return self.http_error_default(url, fp, errcode, errmsg, headers)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000295
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000296 def http_error_default(self, url, fp, errcode, errmsg, headers):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000297 """Default error handler: close the connection and raise IOError."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000298 void = fp.read()
299 fp.close()
300 raise IOError, ('http error', errcode, errmsg, headers)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000301
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000302 if hasattr(socket, "ssl"):
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000303 def open_https(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000304 """Use HTTPS protocol."""
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000305 import httplib
Fred Drake567ca8e2000-08-21 21:42:42 +0000306 user_passwd = None
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000307 if type(url) is type(""):
308 host, selector = splithost(url)
Fred Drake567ca8e2000-08-21 21:42:42 +0000309 if host:
310 user_passwd, host = splituser(host)
311 host = unquote(host)
312 realhost = host
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000313 else:
314 host, selector = url
315 urltype, rest = splittype(selector)
Fred Drake567ca8e2000-08-21 21:42:42 +0000316 url = rest
317 user_passwd = None
318 if string.lower(urltype) != 'https':
319 realhost = None
320 else:
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000321 realhost, rest = splithost(rest)
Fred Drake567ca8e2000-08-21 21:42:42 +0000322 if realhost:
323 user_passwd, realhost = splituser(realhost)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000324 if user_passwd:
325 selector = "%s://%s%s" % (urltype, realhost, rest)
Andrew M. Kuchling7ad47922000-06-10 01:41:48 +0000326 #print "proxy via https:", host, selector
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000327 if not host: raise IOError, ('https error', 'no host given')
328 if user_passwd:
329 import base64
330 auth = string.strip(base64.encodestring(user_passwd))
331 else:
332 auth = None
333 h = httplib.HTTPS(host, 0,
334 key_file=self.key_file,
335 cert_file=self.cert_file)
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000336 if data is not None:
337 h.putrequest('POST', selector)
338 h.putheader('Content-type',
339 'application/x-www-form-urlencoded')
340 h.putheader('Content-length', '%d' % len(data))
341 else:
342 h.putrequest('GET', selector)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000343 if auth: h.putheader('Authorization: Basic %s' % auth)
Fred Drake567ca8e2000-08-21 21:42:42 +0000344 if realhost: h.putheader('Host', realhost)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000345 for args in self.addheaders: apply(h.putheader, args)
346 h.endheaders()
Andrew M. Kuchling43c5af02000-04-24 14:17:06 +0000347 if data is not None:
348 h.send(data + '\r\n')
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000349 errcode, errmsg, headers = h.getreply()
350 fp = h.getfile()
351 if errcode == 200:
352 return addinfourl(fp, headers, url)
353 else:
Fred Drake567ca8e2000-08-21 21:42:42 +0000354 if data is None:
355 return self.http_error(url, fp, errcode, errmsg, headers)
356 else:
357 return self.http_error(url, fp, errcode, errmsg, headers, data)
358
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000359 def open_gopher(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000360 """Use Gopher protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000361 import gopherlib
362 host, selector = splithost(url)
363 if not host: raise IOError, ('gopher error', 'no host given')
364 host = unquote(host)
365 type, selector = splitgophertype(selector)
366 selector, query = splitquery(selector)
367 selector = unquote(selector)
368 if query:
369 query = unquote(query)
370 fp = gopherlib.send_query(selector, query, host)
371 else:
372 fp = gopherlib.send_selector(selector, host)
373 return addinfourl(fp, noheaders(), "gopher:" + url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000374
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000375 def open_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000376 """Use local file or FTP depending on form of URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000377 if url[:2] == '//' and url[2:3] != '/':
378 return self.open_ftp(url)
379 else:
380 return self.open_local_file(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000381
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000382 def open_local_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000383 """Use local file."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000384 import mimetypes, mimetools, StringIO
385 mtype = mimetypes.guess_type(url)[0]
386 headers = mimetools.Message(StringIO.StringIO(
387 'Content-Type: %s\n' % (mtype or 'text/plain')))
388 host, file = splithost(url)
389 if not host:
Guido van Rossum336a2011999-06-24 15:27:36 +0000390 urlfile = file
391 if file[:1] == '/':
392 urlfile = 'file://' + file
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000393 return addinfourl(open(url2pathname(file), 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000394 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000395 host, port = splitport(host)
396 if not port \
397 and socket.gethostbyname(host) in (localhost(), thishost()):
Guido van Rossum336a2011999-06-24 15:27:36 +0000398 urlfile = file
399 if file[:1] == '/':
400 urlfile = 'file://' + file
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000401 return addinfourl(open(url2pathname(file), 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000402 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000403 raise IOError, ('local file error', 'not on local host')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000404
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000405 def open_ftp(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000406 """Use FTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000407 host, path = splithost(url)
408 if not host: raise IOError, ('ftp error', 'no host given')
409 host, port = splitport(host)
410 user, host = splituser(host)
411 if user: user, passwd = splitpasswd(user)
412 else: passwd = None
413 host = unquote(host)
414 user = unquote(user or '')
415 passwd = unquote(passwd or '')
416 host = socket.gethostbyname(host)
417 if not port:
418 import ftplib
419 port = ftplib.FTP_PORT
420 else:
421 port = int(port)
422 path, attrs = splitattr(path)
423 path = unquote(path)
424 dirs = string.splitfields(path, '/')
425 dirs, file = dirs[:-1], dirs[-1]
426 if dirs and not dirs[0]: dirs = dirs[1:]
Guido van Rossum5e006a31999-08-18 17:40:33 +0000427 if dirs and not dirs[0]: dirs[0] = '/'
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000428 key = (user, host, port, string.joinfields(dirs, '/'))
429 # XXX thread unsafe!
430 if len(self.ftpcache) > MAXFTPCACHE:
431 # Prune the cache, rather arbitrarily
432 for k in self.ftpcache.keys():
433 if k != key:
434 v = self.ftpcache[k]
435 del self.ftpcache[k]
436 v.close()
437 try:
438 if not self.ftpcache.has_key(key):
439 self.ftpcache[key] = \
440 ftpwrapper(user, passwd, host, port, dirs)
441 if not file: type = 'D'
442 else: type = 'I'
443 for attr in attrs:
444 attr, value = splitvalue(attr)
445 if string.lower(attr) == 'type' and \
446 value in ('a', 'A', 'i', 'I', 'd', 'D'):
447 type = string.upper(value)
448 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
449 if retrlen is not None and retrlen >= 0:
450 import mimetools, StringIO
451 headers = mimetools.Message(StringIO.StringIO(
452 'Content-Length: %d\n' % retrlen))
453 else:
454 headers = noheaders()
455 return addinfourl(fp, headers, "ftp:" + url)
456 except ftperrors(), msg:
457 raise IOError, ('ftp error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000458
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000459 def open_data(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000460 """Use "data" URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000461 # ignore POSTed data
462 #
463 # syntax of data URLs:
464 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
465 # mediatype := [ type "/" subtype ] *( ";" parameter )
466 # data := *urlchar
467 # parameter := attribute "=" value
468 import StringIO, mimetools, time
469 try:
470 [type, data] = string.split(url, ',', 1)
471 except ValueError:
472 raise IOError, ('data error', 'bad data URL')
473 if not type:
474 type = 'text/plain;charset=US-ASCII'
475 semi = string.rfind(type, ';')
476 if semi >= 0 and '=' not in type[semi:]:
477 encoding = type[semi+1:]
478 type = type[:semi]
479 else:
480 encoding = ''
481 msg = []
482 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
483 time.gmtime(time.time())))
484 msg.append('Content-type: %s' % type)
485 if encoding == 'base64':
486 import base64
487 data = base64.decodestring(data)
488 else:
489 data = unquote(data)
490 msg.append('Content-length: %d' % len(data))
491 msg.append('')
492 msg.append(data)
493 msg = string.join(msg, '\n')
494 f = StringIO.StringIO(msg)
495 headers = mimetools.Message(f, 0)
496 f.fileno = None # needed for addinfourl
497 return addinfourl(f, headers, url)
Guido van Rossum6d4d1c21998-03-12 14:32:55 +0000498
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000499
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as URLopener.__init__.

        Keyword arguments are forwarded as well, so that key_file and
        cert_file reach the base class.  auth_cache maps
        'realm@host' keys to (user, passwd) pairs."""
        apply(URLopener.__init__, (self,) + args, kwargs)
        self.auth_cache = {}

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        Follows the 'location' (or, failing that, 'uri') header;
        returns None when neither header is present."""
        # XXX The server can force infinite recursion here!
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin("http:" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt

        Only the 'basic' scheme is handled; the retry is delegated to
        the method named retry_XXX_basic_auth, XXX being self.type as
        recorded by open().  Returns None when the challenge cannot be
        handled."""
        if headers.has_key('www-authenticate'):
            stuff = headers['www-authenticate']
            import re
            match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
            if match:
                scheme, realm = match.groups()
                if string.lower(scheme) == 'basic':
                    name = 'retry_' + self.type + '_basic_auth'
                    if data is None:
                        return getattr(self,name)(url, realm)
                    else:
                        return getattr(self,name)(url, realm, data)

    def _retry_basic_auth(self, scheme, url, realm, data):
        """Re-open url under scheme with user:passwd@ put before the host."""
        host, selector = splithost(url)
        # i is non-zero when the failed URL already carried credentials;
        # get_user_passwd() then discards the cached pair and asks again.
        i = string.find(host, '@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = user + ':' + passwd + '@' + host
        newurl = scheme + '://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Retry an http request with basic-auth credentials in the URL."""
        return self._retry_basic_auth('http', url, realm, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Retry an https request with basic-auth credentials in the URL.

        Goes through open() and passes data on, like the http variant,
        so a POST is retried as a POST."""
        return self._retry_basic_auth('https', url, realm, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return (user, passwd) for realm at host, prompting if needed.

        A cached pair is returned unless clear_cache is true, in which
        case it is dropped and prompt_user_passwd() is asked again.
        A non-empty answer is stored in the cache."""
        key = realm + '@' + string.lower(host)
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!

        Asks on the terminal; returns (None, None) when the user
        interrupts the prompt."""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000596
597
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000598# Utility functions
599
# Result of the first lookup, reused by later calls.
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The address is looked up once with socket.gethostbyname() and
    remembered in the module-level _localhost."""
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000607
# Result of the first lookup, reused by later calls.
_thishost = None
def thishost():
    """Return the IP address of the current host.

    The address of socket.gethostname() is looked up once with
    socket.gethostbyname() and remembered in the module-level
    _thishost."""
    global _thishost
    if _thishost:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000615
# ftplib.all_errors, fetched on first use so that ftplib is only
# imported when it is actually needed.
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    This is ftplib.all_errors, imported lazily and remembered in the
    module-level _ftperrors."""
    global _ftperrors
    if _ftperrors:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000624
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000625_noheaders = None
626def noheaders():
Guido van Rossume7b146f2000-02-04 15:28:42 +0000627 """Return an empty mimetools.Message object."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000628 global _noheaders
629 if not _noheaders:
630 import mimetools
631 import StringIO
632 _noheaders = mimetools.Message(StringIO.StringIO(), 0)
633 _noheaders.fp.close() # Recycle file descriptor
634 return _noheaders
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000635
636
637# Utility classes
638
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000639class ftpwrapper:
Guido van Rossume7b146f2000-02-04 15:28:42 +0000640 """Class used by open_ftp() for cache of open FTP connections."""
641
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000642 def __init__(self, user, passwd, host, port, dirs):
643 self.user = user
644 self.passwd = passwd
645 self.host = host
646 self.port = port
647 self.dirs = dirs
648 self.init()
Guido van Rossume7b146f2000-02-04 15:28:42 +0000649
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000650 def init(self):
651 import ftplib
652 self.busy = 0
653 self.ftp = ftplib.FTP()
654 self.ftp.connect(self.host, self.port)
655 self.ftp.login(self.user, self.passwd)
656 for dir in self.dirs:
657 self.ftp.cwd(dir)
Guido van Rossume7b146f2000-02-04 15:28:42 +0000658
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000659 def retrfile(self, file, type):
660 import ftplib
661 self.endtransfer()
662 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
663 else: cmd = 'TYPE ' + type; isdir = 0
664 try:
665 self.ftp.voidcmd(cmd)
666 except ftplib.all_errors:
667 self.init()
668 self.ftp.voidcmd(cmd)
669 conn = None
670 if file and not isdir:
671 # Use nlst to see if the file exists at all
672 try:
673 self.ftp.nlst(file)
674 except ftplib.error_perm, reason:
675 raise IOError, ('ftp error', reason), sys.exc_info()[2]
676 # Restore the transfer mode!
677 self.ftp.voidcmd(cmd)
678 # Try to retrieve as a file
679 try:
680 cmd = 'RETR ' + file
681 conn = self.ftp.ntransfercmd(cmd)
682 except ftplib.error_perm, reason:
683 if reason[:3] != '550':
684 raise IOError, ('ftp error', reason), sys.exc_info()[2]
685 if not conn:
686 # Set transfer mode to ASCII!
687 self.ftp.voidcmd('TYPE A')
688 # Try a directory listing
689 if file: cmd = 'LIST ' + file
690 else: cmd = 'LIST'
691 conn = self.ftp.ntransfercmd(cmd)
692 self.busy = 1
693 # Pass back both a suitably decorated object and a retrieval length
694 return (addclosehook(conn[0].makefile('rb'),
695 self.endtransfer), conn[1])
696 def endtransfer(self):
697 if not self.busy:
698 return
699 self.busy = 0
700 try:
701 self.ftp.voidresp()
702 except ftperrors():
703 pass
Guido van Rossume7b146f2000-02-04 15:28:42 +0000704
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000705 def close(self):
706 self.endtransfer()
707 try:
708 self.ftp.close()
709 except ftperrors():
710 pass
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000711
class addbase:
    """Base class for addinfo and addclosehook.

    Wraps a file-like object and re-exports its reading methods as
    attributes of the wrapper."""

    def __init__(self, fp):
        """Wrap fp; read and readline are required, the rest optional."""
        self.fp = fp
        self.read = fp.read
        self.readline = fp.readline
        # readlines and fileno are only copied when fp provides them.
        for optional in ("readlines", "fileno"):
            if hasattr(fp, optional):
                setattr(self, optional, getattr(fp, optional))

    def __repr__(self):
        details = (self.__class__.__name__, repr(id(self)), repr(self.fp))
        return '<%s at %s whose fp = %s>' % details

    def close(self):
        """Drop the borrowed methods and close the wrapped file."""
        for borrowed in ("read", "readline", "readlines", "fileno"):
            setattr(self, borrowed, None)
        wrapped = self.fp
        if wrapped:
            wrapped.close()
        self.fp = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000733
class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        # Close the wrapped file first, then run the hook (at most once).
        addbase.close(self)
        hook = self.closehook
        if hook:
            hook(*self.hookargs)
            self.closehook = None
            self.hookargs = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000748
class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        self.headers = headers
        addbase.__init__(self, fp)

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000758
class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        self.url = url
        self.headers = headers
        addbase.__init__(self, fp)

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers

    def geturl(self):
        """Return the URL given to the constructor."""
        return self.url
Guido van Rossume6ad8911996-09-10 17:02:56 +0000772
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000773
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL."""
    scheme, rest = splittype(url)
    if scheme:
        # url names its own scheme, so it is already complete
        return url
    host, path = splithost(rest)
    scheme, basepath = splittype(base)  # the scheme is taken from base
    if host:
        # url brought its own host; only the scheme can be inherited
        if not scheme:
            # nothing to inherit, so url must have started with '//'
            return url
        return scheme + '://' + host + path
    host, basepath = splithost(basepath)  # the host is taken from base
    basepath = splittag(basepath)[0]      # drop a trailing '#tag'
    basepath = splitquery(basepath)[0]    # ... and a trailing '?query'
    if path[:1] != '/':
        # relative path: work out which part of basepath to keep
        if path[:1] in ('#', '?'):
            # url is only a tag or query, so keep all of basepath
            cut = len(basepath)
        else:
            # otherwise the last component of basepath gets replaced
            cut = basepath.rfind('/')
        if cut >= 0:
            # keep everything up to and including the chosen position
            basepath = basepath[:cut+1]
        elif host:
            # basepath has no '/', but a host is present: make it absolute
            basepath = '/'
        else:
            # no '/' and no host: stay relative
            basepath = ''
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            cut = basepath[:-1].rfind('/')
            if cut > 0:
                basepath = basepath[:cut+1]
            elif cut == 0:
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    if scheme:
        if host:
            return scheme + '://' + host + path
        return scheme + ':' + path
    if host:
        return '//' + host + path  # don't know what this means
    return path
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000828
829
Guido van Rossum7c395db1994-07-04 22:14:49 +0000830# Utilities to parse URLs (most of these return None for missing parts):
Sjoerd Mullendere0371b81995-11-10 10:36:07 +0000831# unwrap('<URL:type://host/path>') --> 'type://host/path'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000832# splittype('type:opaquestring') --> 'type', 'opaquestring'
833# splithost('//host[:port]/path') --> 'host[:port]', '/path'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000834# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
835# splitpasswd('user:passwd') -> 'user', 'passwd'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000836# splitport('host:port') --> 'host', 'port'
837# splitquery('/path?query') --> '/path', 'query'
838# splittag('/path#tag') --> '/path', 'tag'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000839# splitattr('/path;attr1=value1;attr2=value2;...') ->
840# '/path', ['attr1=value1', 'attr2=value2', ...]
841# splitvalue('attr=value') --> 'attr', 'value'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000842# splitgophertype('/Xselector') --> 'X', 'selector'
843# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
845
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = url.strip()
    # Peel off surrounding angle brackets first, then a 'URL:' prefix.
    if url.startswith('<') and url.endswith('>'):
        url = url[1:-1].strip()
    if url.startswith('URL:'):
        url = url[4:].strip()
    return url
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000853
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case; it is None when url does not
    start with 'type:'.
    """
    global _typeprog
    if _typeprog is None:
        # Compile on first use and keep the pattern for later calls.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if not found:
        return None, url
    # The match ends just past the ':' that terminates the type.
    return found.group(1).lower(), url[found.end():]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000867
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # Compile on first use and keep the pattern for later calls.
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    found = _hostprog.match(url)
    if not found:
        return None, url
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000879
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    When an '@' is present both parts are passed through unquote();
    otherwise the result is (None, host).
    """
    global _userprog
    if _userprog is None:
        # Compile on first use and keep the pattern for later calls.
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    found = _userprog.match(host)
    if not found:
        return None, host
    return [unquote(part) for part in found.group(1, 2)]
Guido van Rossum7c395db1994-07-04 22:14:49 +0000891
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    Splits at the first ':'; passwd is None when there is no ':'.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compile on first use and keep the pattern for later calls.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if not found:
        return user, None
    return found.groups()
Guido van Rossum7c395db1994-07-04 22:14:49 +0000903
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits; it is None when host
    does not end in ':' followed by at least one digit.
    """
    global _portprog
    if _portprog is None:
        # Compile on first use and keep the pattern for later calls.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if not found:
        return host, None
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000916
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' is present but not followed by a valid number."""
    global _nportprog
    if _nportprog is None:
        # Compile on first use and keep the pattern for later calls.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if not found:
        return host, defport
    host, port = found.groups()
    try:
        # An empty port fails the conversion just like any other
        # non-number does.
        nport = int(port)
    except ValueError:
        nport = None
    return host, nport
Guido van Rossum53725a21996-06-13 19:12:35 +0000938
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    Splits at the last '?'; query is None when there is no '?'.
    """
    global _queryprog
    if _queryprog is None:
        # Compile on first use and keep the pattern for later calls.
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    found = _queryprog.match(url)
    if not found:
        return url, None
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000950
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    Splits at the last '#'; tag is None when there is no '#'.
    """
    global _tagprog
    if _tagprog is None:
        # Compile on first use and keep the pattern for later calls.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if not found:
        return url, None
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000962
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    pieces = url.split(';')
    path = pieces[0]
    attrs = pieces[1:]
    return path, attrs
Guido van Rossum7c395db1994-07-04 22:14:49 +0000968
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    Splits at the first '='; value is None when there is no '='.
    """
    global _valueprog
    if _valueprog is None:
        # Compile on first use and keep the pattern for later calls.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if not found:
        return attr, None
    return found.groups()
Guido van Rossum7c395db1994-07-04 22:14:49 +0000980
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    Returns (None, selector) unless selector is a '/' followed by at
    least one more character.
    """
    if selector.startswith('/') and len(selector) > 1:
        gtype = selector[1]
        remainder = selector[2:]
        return gtype, remainder
    return None, selector
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000986
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Every '%' followed by exactly two hexadecimal digits is replaced by
    the character with that code.  Any other '%' is left in the result
    untouched, together with the text that follows it.
    """
    hexdigits = '0123456789abcdefABCDEF'
    pieces = s.split('%')
    res = [pieces[0]]
    for item in pieces[1:]:
        code = item[:2]
        # Check the two digits explicitly: the integer conversion alone
        # would also accept a sign or blank (e.g. '+a' or ' 1').
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            res.append(chr(int(code, 16)) + item[2:])
        else:
            res.append('%' + item)
    return ''.join(res)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001005
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'"""
    # Turn each '+' into a space before undoing the %xx escapes.
    if '+' in s:
        s = s.replace('+', ' ')
    return unquote(s)
Guido van Rossum0564e121996-12-13 14:47:36 +00001012
# Characters that are never quoted.  The letters are spelled out because
# string.letters varies with the locale, which would change the set of
# characters left unquoted.
always_safe = ('abcdefghijklmnopqrstuvwxyz'
               'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               '0123456789' '_,.-')
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'.

    Each character of s that is neither in always_safe nor in safe is
    replaced by '%xx', where xx is its character code as two lowercase
    hex digits.  safe defaults to '/', so path separators are left
    alone.
    """
    # XXX Can speed this up an order of magnitude
    safe = always_safe + safe
    res = []
    for c in s:
        if c in safe:
            res.append(c)
        else:
            res.append('%%%02x' % ord(c))
    return ''.join(res)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001024
def quote_plus(s, safe = '/'):
    """Like quote(), but with every space written as '+'.

    The text between spaces is quoted with quote(s, safe).
    """
    # XXX Can speed this up an order of magnitude
    # Without a space the split yields s itself, so this is plain quote().
    return '+'.join([quote(piece, safe) for piece in s.split(' ')])
Guido van Rossum0564e121996-12-13 14:47:36 +00001035
def urlencode(dict):
    """Encode a dictionary of form entries into a URL query string."""
    # Keys and values are converted with str(), quoted with quote_plus()
    # and emitted as 'key=value' pairs joined by '&'.
    pairs = []
    for key, value in dict.items():
        pairs.append(quote_plus(str(key)) + '=' + quote_plus(str(value)))
    return '&'.join(pairs)
Guido van Rossum810a3391998-07-22 21:33:23 +00001044
Guido van Rossum442e7201996-03-20 15:33:11 +00001045
1046# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy (in any
    letter case); this seems to be the standard convention.  Variables
    with an empty value are skipped.  If you need a different way, you
    can pass a proxies dictionary to the [Fancy]URLopener constructor.

    """
    suffix = '_proxy'
    proxies = {}
    for name, value in os.environ.items():
        if not value:
            continue
        name = name.lower()
        if name.endswith(suffix):
            proxies[name[:-len(suffix)]] = value
    return proxies
1062
if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        An empty dictionary is returned when the ic module is missing
        or Internet Config cannot be opened.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.  The ProxyServer value
        is either a single 'host:port', used for both http and ftp, or a
        ';'-separated list of 'protocol=host:port' entries.

        An empty (or partially filled) dictionary is returned when the
        registry cannot be read or the value has an unexpected format.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                'Software\\Microsoft\\Windows\\CurrentVersion\\Internet Settings')
            try:
                proxyEnable = _winreg.QueryValueEx(internetSettings,
                                                   'ProxyEnable')[0]
                if proxyEnable:
                    # Returned as Unicode but problems if not converted to ASCII
                    proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                           'ProxyServer')[0])
                    if '=' in proxyServer:
                        # Per-protocol settings.  Test for '=' rather than
                        # ';' so that a single 'http=host:port' entry is
                        # recognised too.
                        for p in proxyServer.split(';'):
                            # Split once only: the address may hold '='.
                            protocol, address = p.split('=', 1)
                            proxies[protocol] = '%s://%s' % (protocol, address)
                    else:
                        # Use one setting for all protocols
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            finally:
                # Release the registry handle even if a query failed.
                internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()
else:
    # By default use environment variables
    getproxies = getproxies_environment
1142
Guido van Rossum442e7201996-03-20 15:33:11 +00001143
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001144# Test and time quote() and unquote()
def test1():
    """Run all 256 character codes through quote()/unquote() and time it.

    Prints 'Wrong!' when the round trip does not give back the input,
    then the input, the quoted and the unquoted strings, and the
    elapsed time in seconds.
    """
    import time
    # All 256 character codes in order, repeated four times.
    s = ''.join(map(chr, range(256))) * 4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print('Wrong!')
    print(repr(s))
    print(repr(qs))
    print(repr(uqs))
    print('%s sec' % round(t1 - t0, 3))
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001160
1161
def reporthook(blocknum, blocksize, totalsize):
    """Print the three transfer counters on one line.

    test() passes this function to urlretrieve() as its report hook.
    """
    # Report during remote transfers
    status = "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)
    print(status)
1165
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001166# Test program
def test(args=[]):
    """Retrieve each URL in args and print what came back.

    With no args a built-in list of sample URLs is used (plus an https
    URL when URLopener has an open_https method).  For every URL the
    local file name and headers returned by urlretrieve() are printed,
    followed by the retrieved data with carriage returns removed.
    urlcleanup() is always called at the end.
    """
    if not args:
        # A fresh list is built here, so the shared default is never
        # modified by the append below.
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.python.org/etc/passwd',
##          'gopher://gopher.micro.umn.edu/1/',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print('%s %s %s' % ('-'*10, url, '-'*10))
            fn, h = urlretrieve(url, None, reporthook)
            print('%s %s' % (fn, h))
            if h:
                print('======')
                for k in h.keys():
                    print('%s %s' % (k + ':', h[k]))
                print('======')
            fp = open(fn, 'rb')
            try:
                data = fp.read()
            finally:
                # Close explicitly instead of relying on 'del fp'.
                fp.close()
            if '\r' in data:
                data = data.replace('\r', '')
            print(data)
            fn, h = None, None
        print('-'*40)
    finally:
        urlcleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001199
def main():
    """Command-line entry point: urllib.py [-t] [-h] [url ...].

    -h prints a usage message.  -t runs test() on the given URLs; when
    given more than once, test1() is run first.  Without -t the contents
    of each URL are written to standard output.
    """
    import getopt
    import sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error:
        print(sys.exc_info()[1])
        print("Use -h for help")
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print("Usage: python urllib.py [-t] [url ...]")
            print("-t runs self-test; otherwise, contents of urls are printed")
            return
    if t:
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print("Use -h for help")
        for url in args:
            f = urlopen(url)
            try:
                # Write the data as-is; print would add separator
                # spaces and a trailing newline of its own.
                sys.stdout.write(f.read())
            finally:
                f.close()
Guido van Rossum23490151998-06-25 02:39:00 +00001226
# Script entry point: main() only runs when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()