blob: 802d9b760e1bd6e6f2d5936ab84085b25726449e [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""Open an arbitrary URL.
2
3See the following document for more info on URLs:
4"Names and Addresses, URIs, URLs, URNs, URCs", at
5http://www.w3.org/pub/WWW/Addressing/Overview.html
6
7See also the HTTP spec (from which the error codes are derived):
8"HTTP - Hypertext Transfer Protocol", at
9http://www.w3.org/pub/WWW/Protocols/
10
11Related standards and specs:
12- RFC1808: the "relative URL" spec. (authoritative status)
13- RFC1738 - the "URL standard". (authoritative status)
14- RFC1630 - the "URI spec". (informational status)
15
16The object returned by URLopener().open(file) will differ per
17protocol. All you know is that is has methods read(), readline(),
18readlines(), fileno(), close() and info(). The read*(), fileno()
Fredrik Lundhb49f88b2000-09-24 18:51:25 +000019and close() methods work like those of open files.
Guido van Rossume7b146f2000-02-04 15:28:42 +000020The info() method returns a mimetools.Message object which can be
21used to query various info about the object, if available.
22(mimetools.Message objects are queried with the getheader() method.)
23"""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000024
Guido van Rossum7c395db1994-07-04 22:14:49 +000025import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000026import socket
Jack Jansendc3e3f61995-12-15 13:22:13 +000027import os
Guido van Rossumf0713d32001-08-09 17:43:35 +000028import time
Guido van Rossum3c8484e1996-11-20 22:02:24 +000029import sys
Brett Cannon69200fa2004-03-23 21:26:39 +000030from urlparse import urljoin as basejoin
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000031
Skip Montanaro40fc1602001-03-01 04:27:19 +000032__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
33 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
Skip Montanaro44d5e0c2001-03-13 19:47:16 +000034 "urlencode", "url2pathname", "pathname2url", "splittag",
35 "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
36 "splittype", "splithost", "splituser", "splitpasswd", "splitport",
37 "splitnport", "splitquery", "splitattr", "splitvalue",
38 "splitgophertype", "getproxies"]
Skip Montanaro40fc1602001-03-01 04:27:19 +000039
Brett Cannon69200fa2004-03-23 21:26:39 +000040__version__ = '1.16' # XXX This version is not always updated :-(
Guido van Rossumf668d171997-06-06 21:11:11 +000041
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000042MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
Guido van Rossum6cb15a01995-06-22 19:00:13 +000043
Jack Jansendc3e3f61995-12-15 13:22:13 +000044# Helper for non-unix systems
45if os.name == 'mac':
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000046 from macurl2path import url2pathname, pathname2url
Guido van Rossum7e7ca0b1998-03-26 21:01:39 +000047elif os.name == 'nt':
Fredrik Lundhb49f88b2000-09-24 18:51:25 +000048 from nturl2path import url2pathname, pathname2url
Guido van Rossumd74fb6b2001-03-02 06:43:49 +000049elif os.name == 'riscos':
50 from rourl2path import url2pathname, pathname2url
Jack Jansendc3e3f61995-12-15 13:22:13 +000051else:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000052 def url2pathname(pathname):
Georg Brandlc0b24732005-12-26 22:53:56 +000053 """OS-specific conversion from a relative URL of the 'file' scheme
54 to a file system path; not recommended for general use."""
Guido van Rossum367ac801999-03-12 14:31:10 +000055 return unquote(pathname)
Georg Brandlc0b24732005-12-26 22:53:56 +000056
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000057 def pathname2url(pathname):
Georg Brandlc0b24732005-12-26 22:53:56 +000058 """OS-specific conversion from a file system path to a relative URL
59 of the 'file' scheme; not recommended for general use."""
Guido van Rossum367ac801999-03-12 14:31:10 +000060 return quote(pathname)
Guido van Rossum33add0a1998-12-18 15:25:22 +000061
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000062# This really consists of two pieces:
63# (1) a class which handles opening of all sorts of URLs
64# (plus assorted utilities etc.)
65# (2) a set of functions for parsing URLs
66# XXX Should these be separated out into different modules?
67
68
69# Shortcut for basic usage
70_urlopener = None
Fred Drakedf6eca72002-04-04 20:41:34 +000071def urlopen(url, data=None, proxies=None):
Skip Montanaro79f1c172000-08-22 03:00:52 +000072 """urlopen(url [, data]) -> open file-like object"""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000073 global _urlopener
Fred Drakedf6eca72002-04-04 20:41:34 +000074 if proxies is not None:
75 opener = FancyURLopener(proxies=proxies)
76 elif not _urlopener:
77 opener = FancyURLopener()
78 _urlopener = opener
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000079 else:
Fred Drakedf6eca72002-04-04 20:41:34 +000080 opener = _urlopener
81 if data is None:
82 return opener.open(url)
83 else:
84 return opener.open(url, data)
Fred Drake316a7932000-08-24 01:01:26 +000085def urlretrieve(url, filename=None, reporthook=None, data=None):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000086 global _urlopener
87 if not _urlopener:
88 _urlopener = FancyURLopener()
Fred Drake316a7932000-08-24 01:01:26 +000089 return _urlopener.retrieve(url, filename, reporthook, data)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000090def urlcleanup():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000091 if _urlopener:
92 _urlopener.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000093
Georg Brandlb9256022005-08-24 18:46:39 +000094# exception raised when downloaded size does not match content-length
95class ContentTooShortError(IOError):
96 def __init__(self, message, content):
97 IOError.__init__(self, message)
98 self.content = content
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000099
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000100ftpcache = {}
101class URLopener:
Guido van Rossume7b146f2000-02-04 15:28:42 +0000102 """Class to open URLs.
103 This is a class rather than just a subroutine because we may need
104 more than one set of global protocol-specific options.
105 Note -- this is a base class for those who don't want the
106 automatic handling of errors type 302 (relocated) and 401
107 (authorization needed)."""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000108
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000109 __tempfiles = None
Guido van Rossum29e77811996-11-27 19:39:58 +0000110
Guido van Rossumba311382000-08-24 16:18:04 +0000111 version = "Python-urllib/%s" % __version__
112
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000113 # Constructor
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000114 def __init__(self, proxies=None, **x509):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000115 if proxies is None:
116 proxies = getproxies()
117 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
118 self.proxies = proxies
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000119 self.key_file = x509.get('key_file')
120 self.cert_file = x509.get('cert_file')
Guido van Rossumba311382000-08-24 16:18:04 +0000121 self.addheaders = [('User-agent', self.version)]
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000122 self.__tempfiles = []
123 self.__unlink = os.unlink # See cleanup()
124 self.tempcache = None
125 # Undocumented feature: if you assign {} to tempcache,
126 # it is used to cache files retrieved with
127 # self.retrieve(). This is not enabled by default
128 # since it does not work for changing documents (and I
129 # haven't got the logic to check expiration headers
130 # yet).
131 self.ftpcache = ftpcache
132 # Undocumented feature: you can use a different
133 # ftp cache by assigning to the .ftpcache member;
134 # in case you want logically independent URL openers
135 # XXX This is not threadsafe. Bah.
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000136
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000137 def __del__(self):
138 self.close()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000139
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000140 def close(self):
141 self.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000142
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000143 def cleanup(self):
144 # This code sometimes runs when the rest of this module
145 # has already been deleted, so it can't use any globals
146 # or import anything.
147 if self.__tempfiles:
148 for file in self.__tempfiles:
149 try:
150 self.__unlink(file)
Martin v. Löwis58682b72001-08-11 15:02:57 +0000151 except OSError:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000152 pass
153 del self.__tempfiles[:]
154 if self.tempcache:
155 self.tempcache.clear()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000156
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000157 def addheader(self, *args):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000158 """Add a header to be used by the HTTP interface only
159 e.g. u.addheader('Accept', 'sound/basic')"""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000160 self.addheaders.append(args)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000161
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000162 # External interface
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000163 def open(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000164 """Use URLopener().open(file) instead of open(file, 'r')."""
Martin v. Löwis1d994332000-12-03 18:30:10 +0000165 fullurl = unwrap(toBytes(fullurl))
Raymond Hettinger54f02222002-06-01 14:18:47 +0000166 if self.tempcache and fullurl in self.tempcache:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000167 filename, headers = self.tempcache[fullurl]
168 fp = open(filename, 'rb')
169 return addinfourl(fp, headers, fullurl)
Martin v. Löwis1d994332000-12-03 18:30:10 +0000170 urltype, url = splittype(fullurl)
171 if not urltype:
172 urltype = 'file'
Raymond Hettinger54f02222002-06-01 14:18:47 +0000173 if urltype in self.proxies:
Martin v. Löwis1d994332000-12-03 18:30:10 +0000174 proxy = self.proxies[urltype]
175 urltype, proxyhost = splittype(proxy)
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000176 host, selector = splithost(proxyhost)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000177 url = (host, fullurl) # Signal special case to open_*()
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000178 else:
179 proxy = None
Martin v. Löwis1d994332000-12-03 18:30:10 +0000180 name = 'open_' + urltype
181 self.type = urltype
Brett Cannonaaeffaf2004-03-23 23:50:17 +0000182 name = name.replace('-', '_')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000183 if not hasattr(self, name):
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000184 if proxy:
185 return self.open_unknown_proxy(proxy, fullurl, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000186 else:
187 return self.open_unknown(fullurl, data)
188 try:
189 if data is None:
190 return getattr(self, name)(url)
191 else:
192 return getattr(self, name)(url, data)
193 except socket.error, msg:
194 raise IOError, ('socket error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000195
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000196 def open_unknown(self, fullurl, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000197 """Overridable interface to open unknown URL type."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000198 type, url = splittype(fullurl)
199 raise IOError, ('url error', 'unknown url type', type)
Guido van Rossumca445401995-08-29 19:19:12 +0000200
Jeremy Hyltond52755f2000-10-02 23:04:02 +0000201 def open_unknown_proxy(self, proxy, fullurl, data=None):
202 """Overridable interface to open unknown URL type."""
203 type, url = splittype(fullurl)
204 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
205
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000206 # External interface
Sjoerd Mullenderd7b86f02000-08-25 11:23:36 +0000207 def retrieve(self, url, filename=None, reporthook=None, data=None):
Brett Cannon7d618c72003-04-24 02:43:20 +0000208 """retrieve(url) returns (filename, headers) for a local object
Guido van Rossume7b146f2000-02-04 15:28:42 +0000209 or (tempfilename, headers) for a remote object."""
Martin v. Löwis1d994332000-12-03 18:30:10 +0000210 url = unwrap(toBytes(url))
Raymond Hettinger54f02222002-06-01 14:18:47 +0000211 if self.tempcache and url in self.tempcache:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000212 return self.tempcache[url]
213 type, url1 = splittype(url)
Raymond Hettinger10ff7062002-06-02 03:04:52 +0000214 if filename is None and (not type or type == 'file'):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000215 try:
216 fp = self.open_local_file(url1)
217 hdrs = fp.info()
218 del fp
219 return url2pathname(splithost(url1)[1]), hdrs
220 except IOError, msg:
221 pass
Fred Drake316a7932000-08-24 01:01:26 +0000222 fp = self.open(url, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000223 headers = fp.info()
Guido van Rossum3b0a3292002-08-09 16:38:32 +0000224 if filename:
225 tfp = open(filename, 'wb')
226 else:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000227 import tempfile
228 garbage, path = splittype(url)
229 garbage, path = splithost(path or "")
230 path, garbage = splitquery(path or "")
231 path, garbage = splitattr(path or "")
232 suffix = os.path.splitext(path)[1]
Guido van Rossum3b0a3292002-08-09 16:38:32 +0000233 (fd, filename) = tempfile.mkstemp(suffix)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000234 self.__tempfiles.append(filename)
Jeremy Hylton3bd6fde2002-10-11 14:36:24 +0000235 tfp = os.fdopen(fd, 'wb')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000236 result = filename, headers
237 if self.tempcache is not None:
238 self.tempcache[url] = result
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000239 bs = 1024*8
240 size = -1
Georg Brandlb9256022005-08-24 18:46:39 +0000241 read = 0
Georg Brandl5a650a22005-08-26 08:51:34 +0000242 blocknum = 0
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000243 if reporthook:
Raymond Hettinger54f02222002-06-01 14:18:47 +0000244 if "content-length" in headers:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000245 size = int(headers["Content-Length"])
Georg Brandl5a650a22005-08-26 08:51:34 +0000246 reporthook(blocknum, bs, size)
247 while 1:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000248 block = fp.read(bs)
Georg Brandl5a650a22005-08-26 08:51:34 +0000249 if block == "":
250 break
Georg Brandlb9256022005-08-24 18:46:39 +0000251 read += len(block)
Georg Brandl5a650a22005-08-26 08:51:34 +0000252 tfp.write(block)
Georg Brandlb9256022005-08-24 18:46:39 +0000253 blocknum += 1
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000254 if reporthook:
255 reporthook(blocknum, bs, size)
256 fp.close()
257 tfp.close()
258 del fp
259 del tfp
Georg Brandlb9256022005-08-24 18:46:39 +0000260
261 # raise exception if actual size does not match content-length header
262 if size >= 0 and read < size:
263 raise ContentTooShortError("retrieval incomplete: got only %i out "
264 "of %i bytes" % (read, size), result)
265
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000266 return result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000267
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000268 # Each method named open_<type> knows how to open that type of URL
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000269
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000270 def open_http(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000271 """Use HTTP protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000272 import httplib
273 user_passwd = None
Walter Dörwald65230a22002-06-03 15:58:32 +0000274 if isinstance(url, str):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000275 host, selector = splithost(url)
276 if host:
277 user_passwd, host = splituser(host)
278 host = unquote(host)
279 realhost = host
280 else:
281 host, selector = url
282 urltype, rest = splittype(selector)
283 url = rest
284 user_passwd = None
Guido van Rossumb2493f82000-12-15 15:01:37 +0000285 if urltype.lower() != 'http':
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000286 realhost = None
287 else:
288 realhost, rest = splithost(rest)
289 if realhost:
290 user_passwd, realhost = splituser(realhost)
291 if user_passwd:
292 selector = "%s://%s%s" % (urltype, realhost, rest)
Tim Peters55c12d42001-08-09 18:04:14 +0000293 if proxy_bypass(realhost):
294 host = realhost
295
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000296 #print "proxy via http:", host, selector
297 if not host: raise IOError, ('http error', 'no host given')
298 if user_passwd:
299 import base64
Guido van Rossumb2493f82000-12-15 15:01:37 +0000300 auth = base64.encodestring(user_passwd).strip()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000301 else:
302 auth = None
303 h = httplib.HTTP(host)
304 if data is not None:
305 h.putrequest('POST', selector)
306 h.putheader('Content-type', 'application/x-www-form-urlencoded')
307 h.putheader('Content-length', '%d' % len(data))
308 else:
309 h.putrequest('GET', selector)
310 if auth: h.putheader('Authorization', 'Basic %s' % auth)
311 if realhost: h.putheader('Host', realhost)
Guido van Rossum68468eb2003-02-27 20:14:51 +0000312 for args in self.addheaders: h.putheader(*args)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000313 h.endheaders()
314 if data is not None:
Fred Drakeec3dfde2001-07-04 05:18:29 +0000315 h.send(data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000316 errcode, errmsg, headers = h.getreply()
317 fp = h.getfile()
318 if errcode == 200:
319 return addinfourl(fp, headers, "http:" + url)
320 else:
321 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000322 return self.http_error(url, fp, errcode, errmsg, headers)
Guido van Rossum29aab751999-03-09 19:31:21 +0000323 else:
324 return self.http_error(url, fp, errcode, errmsg, headers, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000325
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000326 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000327 """Handle http errors.
328 Derived class can override this, or provide specific handlers
329 named http_error_DDD where DDD is the 3-digit error code."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000330 # First check if there's a specific handler for this error
331 name = 'http_error_%d' % errcode
332 if hasattr(self, name):
333 method = getattr(self, name)
334 if data is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000335 result = method(url, fp, errcode, errmsg, headers)
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000336 else:
337 result = method(url, fp, errcode, errmsg, headers, data)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000338 if result: return result
Jeremy Hyltonb30f52a1999-02-25 16:14:58 +0000339 return self.http_error_default(url, fp, errcode, errmsg, headers)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000340
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000341 def http_error_default(self, url, fp, errcode, errmsg, headers):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000342 """Default error handler: close the connection and raise IOError."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000343 void = fp.read()
344 fp.close()
345 raise IOError, ('http error', errcode, errmsg, headers)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000346
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000347 if hasattr(socket, "ssl"):
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000348 def open_https(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000349 """Use HTTPS protocol."""
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000350 import httplib
Fred Drake567ca8e2000-08-21 21:42:42 +0000351 user_passwd = None
Walter Dörwald65230a22002-06-03 15:58:32 +0000352 if isinstance(url, str):
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000353 host, selector = splithost(url)
Fred Drake567ca8e2000-08-21 21:42:42 +0000354 if host:
355 user_passwd, host = splituser(host)
356 host = unquote(host)
357 realhost = host
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000358 else:
359 host, selector = url
360 urltype, rest = splittype(selector)
Fred Drake567ca8e2000-08-21 21:42:42 +0000361 url = rest
362 user_passwd = None
Guido van Rossumb2493f82000-12-15 15:01:37 +0000363 if urltype.lower() != 'https':
Fred Drake567ca8e2000-08-21 21:42:42 +0000364 realhost = None
365 else:
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000366 realhost, rest = splithost(rest)
Fred Drake567ca8e2000-08-21 21:42:42 +0000367 if realhost:
368 user_passwd, realhost = splituser(realhost)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000369 if user_passwd:
370 selector = "%s://%s%s" % (urltype, realhost, rest)
Andrew M. Kuchling7ad47922000-06-10 01:41:48 +0000371 #print "proxy via https:", host, selector
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000372 if not host: raise IOError, ('https error', 'no host given')
373 if user_passwd:
374 import base64
Guido van Rossumb2493f82000-12-15 15:01:37 +0000375 auth = base64.encodestring(user_passwd).strip()
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000376 else:
377 auth = None
378 h = httplib.HTTPS(host, 0,
379 key_file=self.key_file,
380 cert_file=self.cert_file)
Andrew M. Kuchling141e9892000-04-23 02:53:11 +0000381 if data is not None:
382 h.putrequest('POST', selector)
383 h.putheader('Content-type',
384 'application/x-www-form-urlencoded')
385 h.putheader('Content-length', '%d' % len(data))
386 else:
387 h.putrequest('GET', selector)
Andrew M. Kuchlingff638ea2003-08-29 18:12:23 +0000388 if auth: h.putheader('Authorization', 'Basic %s' % auth)
Fred Drake567ca8e2000-08-21 21:42:42 +0000389 if realhost: h.putheader('Host', realhost)
Guido van Rossum68468eb2003-02-27 20:14:51 +0000390 for args in self.addheaders: h.putheader(*args)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000391 h.endheaders()
Andrew M. Kuchling43c5af02000-04-24 14:17:06 +0000392 if data is not None:
Fred Drakeec3dfde2001-07-04 05:18:29 +0000393 h.send(data)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000394 errcode, errmsg, headers = h.getreply()
395 fp = h.getfile()
396 if errcode == 200:
Guido van Rossumb931bf32001-12-08 17:09:07 +0000397 return addinfourl(fp, headers, "https:" + url)
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000398 else:
Fred Drake567ca8e2000-08-21 21:42:42 +0000399 if data is None:
400 return self.http_error(url, fp, errcode, errmsg, headers)
401 else:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000402 return self.http_error(url, fp, errcode, errmsg, headers,
403 data)
Fred Drake567ca8e2000-08-21 21:42:42 +0000404
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000405 def open_gopher(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000406 """Use Gopher protocol."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000407 import gopherlib
408 host, selector = splithost(url)
409 if not host: raise IOError, ('gopher error', 'no host given')
410 host = unquote(host)
411 type, selector = splitgophertype(selector)
412 selector, query = splitquery(selector)
413 selector = unquote(selector)
414 if query:
415 query = unquote(query)
416 fp = gopherlib.send_query(selector, query, host)
417 else:
418 fp = gopherlib.send_selector(selector, host)
419 return addinfourl(fp, noheaders(), "gopher:" + url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000420
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000421 def open_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000422 """Use local file or FTP depending on form of URL."""
Jack Jansen4ef11032002-09-12 20:14:04 +0000423 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000424 return self.open_ftp(url)
425 else:
426 return self.open_local_file(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000427
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000428 def open_local_file(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000429 """Use local file."""
Raymond Hettingera6172712004-12-31 19:15:26 +0000430 import mimetypes, mimetools, email.Utils
431 try:
432 from cStringIO import StringIO
433 except ImportError:
434 from StringIO import StringIO
Guido van Rossumf0713d32001-08-09 17:43:35 +0000435 host, file = splithost(url)
436 localname = url2pathname(file)
Guido van Rossuma2da3052002-04-15 00:25:01 +0000437 try:
438 stats = os.stat(localname)
439 except OSError, e:
440 raise IOError(e.errno, e.strerror, e.filename)
Walter Dörwald92b48b72002-03-22 17:30:38 +0000441 size = stats.st_size
Anthony Baxter3dd9e462004-10-11 13:53:08 +0000442 modified = email.Utils.formatdate(stats.st_mtime, usegmt=True)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000443 mtype = mimetypes.guess_type(url)[0]
Raymond Hettingera6172712004-12-31 19:15:26 +0000444 headers = mimetools.Message(StringIO(
Guido van Rossumf0713d32001-08-09 17:43:35 +0000445 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
446 (mtype or 'text/plain', size, modified)))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000447 if not host:
Guido van Rossum336a2011999-06-24 15:27:36 +0000448 urlfile = file
449 if file[:1] == '/':
450 urlfile = 'file://' + file
Guido van Rossumf0713d32001-08-09 17:43:35 +0000451 return addinfourl(open(localname, 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000452 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000453 host, port = splitport(host)
454 if not port \
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000455 and socket.gethostbyname(host) in (localhost(), thishost()):
Guido van Rossum336a2011999-06-24 15:27:36 +0000456 urlfile = file
457 if file[:1] == '/':
458 urlfile = 'file://' + file
Guido van Rossumf0713d32001-08-09 17:43:35 +0000459 return addinfourl(open(localname, 'rb'),
Guido van Rossum336a2011999-06-24 15:27:36 +0000460 headers, urlfile)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000461 raise IOError, ('local file error', 'not on local host')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000462
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000463 def open_ftp(self, url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000464 """Use FTP protocol."""
Raymond Hettingera6172712004-12-31 19:15:26 +0000465 import mimetypes, mimetools
466 try:
467 from cStringIO import StringIO
468 except ImportError:
469 from StringIO import StringIO
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000470 host, path = splithost(url)
471 if not host: raise IOError, ('ftp error', 'no host given')
472 host, port = splitport(host)
473 user, host = splituser(host)
474 if user: user, passwd = splitpasswd(user)
475 else: passwd = None
476 host = unquote(host)
477 user = unquote(user or '')
478 passwd = unquote(passwd or '')
479 host = socket.gethostbyname(host)
480 if not port:
481 import ftplib
482 port = ftplib.FTP_PORT
483 else:
484 port = int(port)
485 path, attrs = splitattr(path)
486 path = unquote(path)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000487 dirs = path.split('/')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000488 dirs, file = dirs[:-1], dirs[-1]
489 if dirs and not dirs[0]: dirs = dirs[1:]
Guido van Rossum5e006a31999-08-18 17:40:33 +0000490 if dirs and not dirs[0]: dirs[0] = '/'
Guido van Rossumb2493f82000-12-15 15:01:37 +0000491 key = user, host, port, '/'.join(dirs)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000492 # XXX thread unsafe!
493 if len(self.ftpcache) > MAXFTPCACHE:
494 # Prune the cache, rather arbitrarily
495 for k in self.ftpcache.keys():
496 if k != key:
497 v = self.ftpcache[k]
498 del self.ftpcache[k]
499 v.close()
500 try:
Raymond Hettinger54f02222002-06-01 14:18:47 +0000501 if not key in self.ftpcache:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000502 self.ftpcache[key] = \
503 ftpwrapper(user, passwd, host, port, dirs)
504 if not file: type = 'D'
505 else: type = 'I'
506 for attr in attrs:
507 attr, value = splitvalue(attr)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000508 if attr.lower() == 'type' and \
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000509 value in ('a', 'A', 'i', 'I', 'd', 'D'):
Guido van Rossumb2493f82000-12-15 15:01:37 +0000510 type = value.upper()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000511 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
Guido van Rossum88e0b5b2001-08-23 13:38:15 +0000512 mtype = mimetypes.guess_type("ftp:" + url)[0]
513 headers = ""
514 if mtype:
515 headers += "Content-Type: %s\n" % mtype
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000516 if retrlen is not None and retrlen >= 0:
Guido van Rossum88e0b5b2001-08-23 13:38:15 +0000517 headers += "Content-Length: %d\n" % retrlen
Raymond Hettingera6172712004-12-31 19:15:26 +0000518 headers = mimetools.Message(StringIO(headers))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000519 return addinfourl(fp, headers, "ftp:" + url)
520 except ftperrors(), msg:
521 raise IOError, ('ftp error', msg), sys.exc_info()[2]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000522
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000523 def open_data(self, url, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000524 """Use "data" URL."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000525 # ignore POSTed data
526 #
527 # syntax of data URLs:
528 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
529 # mediatype := [ type "/" subtype ] *( ";" parameter )
530 # data := *urlchar
531 # parameter := attribute "=" value
Raymond Hettingera6172712004-12-31 19:15:26 +0000532 import mimetools
533 try:
534 from cStringIO import StringIO
535 except ImportError:
536 from StringIO import StringIO
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000537 try:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000538 [type, data] = url.split(',', 1)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000539 except ValueError:
540 raise IOError, ('data error', 'bad data URL')
541 if not type:
542 type = 'text/plain;charset=US-ASCII'
Guido van Rossumb2493f82000-12-15 15:01:37 +0000543 semi = type.rfind(';')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000544 if semi >= 0 and '=' not in type[semi:]:
545 encoding = type[semi+1:]
546 type = type[:semi]
547 else:
548 encoding = ''
549 msg = []
550 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
551 time.gmtime(time.time())))
552 msg.append('Content-type: %s' % type)
553 if encoding == 'base64':
554 import base64
555 data = base64.decodestring(data)
556 else:
557 data = unquote(data)
558 msg.append('Content-length: %d' % len(data))
559 msg.append('')
560 msg.append(data)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000561 msg = '\n'.join(msg)
Raymond Hettingera6172712004-12-31 19:15:26 +0000562 f = StringIO(msg)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000563 headers = mimetools.Message(f, 0)
Georg Brandl1f663572005-11-26 16:50:44 +0000564 #f.fileno = None # needed for addinfourl
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000565 return addinfourl(f, headers, url)
Guido van Rossum6d4d1c21998-03-12 14:32:55 +0000566
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000567
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000568class FancyURLopener(URLopener):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000569 """Derived class with handlers for errors we can handle (perhaps)."""
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000570
Neal Norwitz60e04cd2002-06-11 13:38:51 +0000571 def __init__(self, *args, **kwargs):
Guido van Rossum68468eb2003-02-27 20:14:51 +0000572 URLopener.__init__(self, *args, **kwargs)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000573 self.auth_cache = {}
Skip Montanaroc3e11d62001-02-15 16:56:36 +0000574 self.tries = 0
575 self.maxtries = 10
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000576
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000577 def http_error_default(self, url, fp, errcode, errmsg, headers):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000578 """Default error handling -- don't raise an exception."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000579 return addinfourl(fp, headers, "http:" + url)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000580
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000581 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000582 """Error 302 -- relocated (temporarily)."""
Skip Montanaroc3e11d62001-02-15 16:56:36 +0000583 self.tries += 1
584 if self.maxtries and self.tries >= self.maxtries:
585 if hasattr(self, "http_error_500"):
586 meth = self.http_error_500
587 else:
588 meth = self.http_error_default
589 self.tries = 0
590 return meth(url, fp, 500,
591 "Internal Server Error: Redirect Recursion", headers)
592 result = self.redirect_internal(url, fp, errcode, errmsg, headers,
593 data)
594 self.tries = 0
595 return result
596
597 def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
Raymond Hettinger54f02222002-06-01 14:18:47 +0000598 if 'location' in headers:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000599 newurl = headers['location']
Raymond Hettinger54f02222002-06-01 14:18:47 +0000600 elif 'uri' in headers:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000601 newurl = headers['uri']
602 else:
603 return
604 void = fp.read()
605 fp.close()
Guido van Rossum3527f591999-03-29 20:23:41 +0000606 # In case the server sent a relative URL, join with original:
Moshe Zadka5d87d472001-04-09 14:54:21 +0000607 newurl = basejoin(self.type + ":" + url, newurl)
Guido van Rossumfa19f7c2003-05-16 01:46:51 +0000608 return self.open(newurl)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000609
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000610 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000611 """Error 301 -- also relocated (permanently)."""
612 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
Guido van Rossume6ad8911996-09-10 17:02:56 +0000613
Raymond Hettinger024aaa12003-04-24 15:32:12 +0000614 def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
615 """Error 303 -- also relocated (essentially identical to 302)."""
616 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
617
Guido van Rossumfa19f7c2003-05-16 01:46:51 +0000618 def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
619 """Error 307 -- relocated, but turn POST into error."""
620 if data is None:
621 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
622 else:
623 return self.http_error_default(url, fp, errcode, errmsg, headers)
624
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000625 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000626 """Error 401 -- authentication required.
627 See this URL for a description of the basic authentication scheme:
628 http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
Raymond Hettinger54f02222002-06-01 14:18:47 +0000629 if not 'www-authenticate' in headers:
Tim Peters85ba6732001-02-28 08:26:44 +0000630 URLopener.http_error_default(self, url, fp,
Fred Drakec680ae82001-10-13 18:37:07 +0000631 errcode, errmsg, headers)
Moshe Zadkae99bd172001-02-27 06:27:04 +0000632 stuff = headers['www-authenticate']
633 import re
634 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
635 if not match:
Tim Peters85ba6732001-02-28 08:26:44 +0000636 URLopener.http_error_default(self, url, fp,
Moshe Zadkae99bd172001-02-27 06:27:04 +0000637 errcode, errmsg, headers)
638 scheme, realm = match.groups()
639 if scheme.lower() != 'basic':
Tim Peters85ba6732001-02-28 08:26:44 +0000640 URLopener.http_error_default(self, url, fp,
Moshe Zadkae99bd172001-02-27 06:27:04 +0000641 errcode, errmsg, headers)
642 name = 'retry_' + self.type + '_basic_auth'
643 if data is None:
644 return getattr(self,name)(url, realm)
645 else:
646 return getattr(self,name)(url, realm, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000647
Guido van Rossum3c8baed2000-02-01 23:36:55 +0000648 def retry_http_basic_auth(self, url, realm, data=None):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000649 host, selector = splithost(url)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000650 i = host.find('@') + 1
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000651 host = host[i:]
652 user, passwd = self.get_user_passwd(host, realm, i)
653 if not (user or passwd): return None
Guido van Rossumafc4f042001-01-15 18:31:13 +0000654 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000655 newurl = 'http://' + host + selector
Guido van Rossum3c8baed2000-02-01 23:36:55 +0000656 if data is None:
657 return self.open(newurl)
658 else:
659 return self.open(newurl, data)
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000660
Guido van Rossum3c8baed2000-02-01 23:36:55 +0000661 def retry_https_basic_auth(self, url, realm, data=None):
Tim Peterse1190062001-01-15 03:34:38 +0000662 host, selector = splithost(url)
663 i = host.find('@') + 1
664 host = host[i:]
665 user, passwd = self.get_user_passwd(host, realm, i)
666 if not (user or passwd): return None
Guido van Rossumafc4f042001-01-15 18:31:13 +0000667 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
Tim Peterse1190062001-01-15 03:34:38 +0000668 newurl = '//' + host + selector
Guido van Rossumafc4f042001-01-15 18:31:13 +0000669 return self.open_https(newurl, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000670
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000671 def get_user_passwd(self, host, realm, clear_cache = 0):
Guido van Rossumb2493f82000-12-15 15:01:37 +0000672 key = realm + '@' + host.lower()
Raymond Hettinger54f02222002-06-01 14:18:47 +0000673 if key in self.auth_cache:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000674 if clear_cache:
675 del self.auth_cache[key]
676 else:
677 return self.auth_cache[key]
678 user, passwd = self.prompt_user_passwd(host, realm)
679 if user or passwd: self.auth_cache[key] = (user, passwd)
680 return user, passwd
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000681
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000682 def prompt_user_passwd(self, host, realm):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000683 """Override this in a GUI environment!"""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000684 import getpass
685 try:
686 user = raw_input("Enter username for %s at %s: " % (realm,
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000687 host))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000688 passwd = getpass.getpass("Enter password for %s in %s at %s: " %
689 (user, realm, host))
690 return user, passwd
691 except KeyboardInterrupt:
692 print
693 return None, None
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000694
695
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000696# Utility functions
697
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000698_localhost = None
699def localhost():
Guido van Rossume7b146f2000-02-04 15:28:42 +0000700 """Return the IP address of the magic hostname 'localhost'."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000701 global _localhost
Raymond Hettinger10ff7062002-06-02 03:04:52 +0000702 if _localhost is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000703 _localhost = socket.gethostbyname('localhost')
704 return _localhost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000705
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000706_thishost = None
707def thishost():
Guido van Rossume7b146f2000-02-04 15:28:42 +0000708 """Return the IP address of the current host."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000709 global _thishost
Raymond Hettinger10ff7062002-06-02 03:04:52 +0000710 if _thishost is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000711 _thishost = socket.gethostbyname(socket.gethostname())
712 return _thishost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000713
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000714_ftperrors = None
715def ftperrors():
Guido van Rossume7b146f2000-02-04 15:28:42 +0000716 """Return the set of errors raised by the FTP class."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000717 global _ftperrors
Raymond Hettinger10ff7062002-06-02 03:04:52 +0000718 if _ftperrors is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000719 import ftplib
720 _ftperrors = ftplib.all_errors
721 return _ftperrors
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000722
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000723_noheaders = None
724def noheaders():
Guido van Rossume7b146f2000-02-04 15:28:42 +0000725 """Return an empty mimetools.Message object."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000726 global _noheaders
Raymond Hettinger10ff7062002-06-02 03:04:52 +0000727 if _noheaders is None:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000728 import mimetools
Raymond Hettingera6172712004-12-31 19:15:26 +0000729 try:
730 from cStringIO import StringIO
731 except ImportError:
732 from StringIO import StringIO
733 _noheaders = mimetools.Message(StringIO(), 0)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000734 _noheaders.fp.close() # Recycle file descriptor
735 return _noheaders
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000736
737
738# Utility classes
739
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000740class ftpwrapper:
Guido van Rossume7b146f2000-02-04 15:28:42 +0000741 """Class used by open_ftp() for cache of open FTP connections."""
742
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000743 def __init__(self, user, passwd, host, port, dirs):
744 self.user = user
745 self.passwd = passwd
746 self.host = host
747 self.port = port
748 self.dirs = dirs
749 self.init()
Guido van Rossume7b146f2000-02-04 15:28:42 +0000750
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000751 def init(self):
752 import ftplib
753 self.busy = 0
754 self.ftp = ftplib.FTP()
755 self.ftp.connect(self.host, self.port)
756 self.ftp.login(self.user, self.passwd)
757 for dir in self.dirs:
758 self.ftp.cwd(dir)
Guido van Rossume7b146f2000-02-04 15:28:42 +0000759
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000760 def retrfile(self, file, type):
761 import ftplib
762 self.endtransfer()
763 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
764 else: cmd = 'TYPE ' + type; isdir = 0
765 try:
766 self.ftp.voidcmd(cmd)
767 except ftplib.all_errors:
768 self.init()
769 self.ftp.voidcmd(cmd)
770 conn = None
771 if file and not isdir:
772 # Use nlst to see if the file exists at all
773 try:
774 self.ftp.nlst(file)
775 except ftplib.error_perm, reason:
776 raise IOError, ('ftp error', reason), sys.exc_info()[2]
777 # Restore the transfer mode!
778 self.ftp.voidcmd(cmd)
779 # Try to retrieve as a file
780 try:
781 cmd = 'RETR ' + file
782 conn = self.ftp.ntransfercmd(cmd)
783 except ftplib.error_perm, reason:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000784 if str(reason)[:3] != '550':
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000785 raise IOError, ('ftp error', reason), sys.exc_info()[2]
786 if not conn:
787 # Set transfer mode to ASCII!
788 self.ftp.voidcmd('TYPE A')
789 # Try a directory listing
790 if file: cmd = 'LIST ' + file
791 else: cmd = 'LIST'
792 conn = self.ftp.ntransfercmd(cmd)
793 self.busy = 1
794 # Pass back both a suitably decorated object and a retrieval length
795 return (addclosehook(conn[0].makefile('rb'),
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000796 self.endtransfer), conn[1])
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000797 def endtransfer(self):
798 if not self.busy:
799 return
800 self.busy = 0
801 try:
802 self.ftp.voidresp()
803 except ftperrors():
804 pass
Guido van Rossume7b146f2000-02-04 15:28:42 +0000805
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000806 def close(self):
807 self.endtransfer()
808 try:
809 self.ftp.close()
810 except ftperrors():
811 pass
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000812
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000813class addbase:
Guido van Rossume7b146f2000-02-04 15:28:42 +0000814 """Base class for addinfo and addclosehook."""
815
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000816 def __init__(self, fp):
817 self.fp = fp
818 self.read = self.fp.read
819 self.readline = self.fp.readline
Guido van Rossum09c8b6c1999-12-07 21:37:17 +0000820 if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
Georg Brandl1f663572005-11-26 16:50:44 +0000821 if hasattr(self.fp, "fileno"):
822 self.fileno = self.fp.fileno
823 else:
824 self.fileno = lambda: None
Raymond Hettinger42182eb2003-03-09 05:33:33 +0000825 if hasattr(self.fp, "__iter__"):
826 self.__iter__ = self.fp.__iter__
827 if hasattr(self.fp, "next"):
828 self.next = self.fp.next
Guido van Rossume7b146f2000-02-04 15:28:42 +0000829
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000830 def __repr__(self):
Walter Dörwald70a6b492004-02-12 17:35:32 +0000831 return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
832 id(self), self.fp)
Guido van Rossume7b146f2000-02-04 15:28:42 +0000833
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000834 def close(self):
835 self.read = None
836 self.readline = None
837 self.readlines = None
838 self.fileno = None
839 if self.fp: self.fp.close()
840 self.fp = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000841
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000842class addclosehook(addbase):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000843 """Class to add a close hook to an open file."""
844
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000845 def __init__(self, fp, closehook, *hookargs):
846 addbase.__init__(self, fp)
847 self.closehook = closehook
848 self.hookargs = hookargs
Guido van Rossume7b146f2000-02-04 15:28:42 +0000849
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000850 def close(self):
Guido van Rossumc580dae2000-05-24 13:21:46 +0000851 addbase.close(self)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000852 if self.closehook:
Guido van Rossum68468eb2003-02-27 20:14:51 +0000853 self.closehook(*self.hookargs)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000854 self.closehook = None
855 self.hookargs = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000856
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000857class addinfo(addbase):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000858 """class to add an info() method to an open file."""
859
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000860 def __init__(self, fp, headers):
861 addbase.__init__(self, fp)
862 self.headers = headers
Guido van Rossume7b146f2000-02-04 15:28:42 +0000863
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000864 def info(self):
865 return self.headers
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000866
Guido van Rossume6ad8911996-09-10 17:02:56 +0000867class addinfourl(addbase):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000868 """class to add info() and geturl() methods to an open file."""
869
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000870 def __init__(self, fp, headers, url):
871 addbase.__init__(self, fp)
872 self.headers = headers
873 self.url = url
Guido van Rossume7b146f2000-02-04 15:28:42 +0000874
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000875 def info(self):
876 return self.headers
Guido van Rossume7b146f2000-02-04 15:28:42 +0000877
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000878 def geturl(self):
879 return self.url
Guido van Rossume6ad8911996-09-10 17:02:56 +0000880
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000881
Guido van Rossum7c395db1994-07-04 22:14:49 +0000882# Utilities to parse URLs (most of these return None for missing parts):
Sjoerd Mullendere0371b81995-11-10 10:36:07 +0000883# unwrap('<URL:type://host/path>') --> 'type://host/path'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000884# splittype('type:opaquestring') --> 'type', 'opaquestring'
885# splithost('//host[:port]/path') --> 'host[:port]', '/path'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000886# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
887# splitpasswd('user:passwd') -> 'user', 'passwd'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000888# splitport('host:port') --> 'host', 'port'
889# splitquery('/path?query') --> '/path', 'query'
890# splittag('/path#tag') --> '/path', 'tag'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000891# splitattr('/path;attr1=value1;attr2=value2;...') ->
892# '/path', ['attr1=value1', 'attr2=value2', ...]
893# splitvalue('attr=value') --> 'attr', 'value'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000894# splitgophertype('/Xselector') --> 'X', 'selector'
895# unquote('abc%20def') -> 'abc def'
896# quote('abc def') -> 'abc%20def')
897
Walter Dörwald65230a22002-06-03 15:58:32 +0000898try:
899 unicode
900except NameError:
Guido van Rossum4b46c0a2002-05-24 17:58:05 +0000901 def _is_unicode(x):
902 return 0
Walter Dörwald65230a22002-06-03 15:58:32 +0000903else:
904 def _is_unicode(x):
905 return isinstance(x, unicode)
Guido van Rossum4b46c0a2002-05-24 17:58:05 +0000906
Martin v. Löwis1d994332000-12-03 18:30:10 +0000907def toBytes(url):
908 """toBytes(u"URL") --> 'URL'."""
909 # Most URL schemes require ASCII. If that changes, the conversion
910 # can be relaxed
Guido van Rossum4b46c0a2002-05-24 17:58:05 +0000911 if _is_unicode(url):
Martin v. Löwis1d994332000-12-03 18:30:10 +0000912 try:
913 url = url.encode("ASCII")
914 except UnicodeError:
Guido van Rossumb2493f82000-12-15 15:01:37 +0000915 raise UnicodeError("URL " + repr(url) +
916 " contains non-ASCII characters")
Martin v. Löwis1d994332000-12-03 18:30:10 +0000917 return url
918
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000919def unwrap(url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000920 """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
Guido van Rossumb2493f82000-12-15 15:01:37 +0000921 url = url.strip()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000922 if url[:1] == '<' and url[-1:] == '>':
Guido van Rossumb2493f82000-12-15 15:01:37 +0000923 url = url[1:-1].strip()
924 if url[:4] == 'URL:': url = url[4:].strip()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000925 return url
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000926
Guido van Rossum332e1441997-09-29 23:23:46 +0000927_typeprog = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000928def splittype(url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000929 """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000930 global _typeprog
931 if _typeprog is None:
932 import re
933 _typeprog = re.compile('^([^/:]+):')
Guido van Rossum332e1441997-09-29 23:23:46 +0000934
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000935 match = _typeprog.match(url)
936 if match:
937 scheme = match.group(1)
Fred Drake9e94afd2000-07-01 07:03:30 +0000938 return scheme.lower(), url[len(scheme) + 1:]
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000939 return None, url
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000940
Guido van Rossum332e1441997-09-29 23:23:46 +0000941_hostprog = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000942def splithost(url):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000943 """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000944 global _hostprog
945 if _hostprog is None:
946 import re
Guido van Rossum3427c1f1999-07-01 23:20:56 +0000947 _hostprog = re.compile('^//([^/]*)(.*)$')
Guido van Rossum332e1441997-09-29 23:23:46 +0000948
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000949 match = _hostprog.match(url)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000950 if match: return match.group(1, 2)
951 return None, url
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000952
Guido van Rossum332e1441997-09-29 23:23:46 +0000953_userprog = None
Guido van Rossum7c395db1994-07-04 22:14:49 +0000954def splituser(host):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000955 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000956 global _userprog
957 if _userprog is None:
958 import re
Raymond Hettingerf2e45dd2002-08-18 20:08:56 +0000959 _userprog = re.compile('^(.*)@(.*)$')
Guido van Rossum332e1441997-09-29 23:23:46 +0000960
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000961 match = _userprog.match(host)
Fred Drake567ca8e2000-08-21 21:42:42 +0000962 if match: return map(unquote, match.group(1, 2))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000963 return None, host
Guido van Rossum7c395db1994-07-04 22:14:49 +0000964
Guido van Rossum332e1441997-09-29 23:23:46 +0000965_passwdprog = None
Guido van Rossum7c395db1994-07-04 22:14:49 +0000966def splitpasswd(user):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000967 """splitpasswd('user:passwd') -> 'user', 'passwd'."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000968 global _passwdprog
969 if _passwdprog is None:
970 import re
971 _passwdprog = re.compile('^([^:]*):(.*)$')
Guido van Rossum332e1441997-09-29 23:23:46 +0000972
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000973 match = _passwdprog.match(user)
974 if match: return match.group(1, 2)
975 return user, None
Guido van Rossum7c395db1994-07-04 22:14:49 +0000976
Guido van Rossume7b146f2000-02-04 15:28:42 +0000977# splittag('/path#tag') --> '/path', 'tag'
Guido van Rossum332e1441997-09-29 23:23:46 +0000978_portprog = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000979def splitport(host):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000980 """splitport('host:port') --> 'host', 'port'."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000981 global _portprog
982 if _portprog is None:
983 import re
984 _portprog = re.compile('^(.*):([0-9]+)$')
Guido van Rossum332e1441997-09-29 23:23:46 +0000985
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000986 match = _portprog.match(host)
987 if match: return match.group(1, 2)
988 return host, None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000989
Guido van Rossum332e1441997-09-29 23:23:46 +0000990_nportprog = None
Guido van Rossum53725a21996-06-13 19:12:35 +0000991def splitnport(host, defport=-1):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000992 """Split host and port, returning numeric port.
993 Return given default port if no ':' found; defaults to -1.
994 Return numerical port if a valid number are found after ':'.
995 Return None if ':' but not a valid number."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000996 global _nportprog
997 if _nportprog is None:
998 import re
999 _nportprog = re.compile('^(.*):(.*)$')
Guido van Rossum7e7ca0b1998-03-26 21:01:39 +00001000
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001001 match = _nportprog.match(host)
1002 if match:
1003 host, port = match.group(1, 2)
1004 try:
Guido van Rossumb2493f82000-12-15 15:01:37 +00001005 if not port: raise ValueError, "no digits"
1006 nport = int(port)
1007 except ValueError:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001008 nport = None
1009 return host, nport
1010 return host, defport
Guido van Rossum53725a21996-06-13 19:12:35 +00001011
Guido van Rossum332e1441997-09-29 23:23:46 +00001012_queryprog = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001013def splitquery(url):
Guido van Rossume7b146f2000-02-04 15:28:42 +00001014 """splitquery('/path?query') --> '/path', 'query'."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001015 global _queryprog
1016 if _queryprog is None:
1017 import re
1018 _queryprog = re.compile('^(.*)\?([^?]*)$')
Guido van Rossum332e1441997-09-29 23:23:46 +00001019
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001020 match = _queryprog.match(url)
1021 if match: return match.group(1, 2)
1022 return url, None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001023
Guido van Rossum332e1441997-09-29 23:23:46 +00001024_tagprog = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001025def splittag(url):
Guido van Rossume7b146f2000-02-04 15:28:42 +00001026 """splittag('/path#tag') --> '/path', 'tag'."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001027 global _tagprog
1028 if _tagprog is None:
1029 import re
1030 _tagprog = re.compile('^(.*)#([^#]*)$')
Guido van Rossum7e7ca0b1998-03-26 21:01:39 +00001031
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001032 match = _tagprog.match(url)
1033 if match: return match.group(1, 2)
1034 return url, None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001035
Guido van Rossum7c395db1994-07-04 22:14:49 +00001036def splitattr(url):
Guido van Rossume7b146f2000-02-04 15:28:42 +00001037 """splitattr('/path;attr1=value1;attr2=value2;...') ->
1038 '/path', ['attr1=value1', 'attr2=value2', ...]."""
Guido van Rossumb2493f82000-12-15 15:01:37 +00001039 words = url.split(';')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001040 return words[0], words[1:]
Guido van Rossum7c395db1994-07-04 22:14:49 +00001041
Guido van Rossum332e1441997-09-29 23:23:46 +00001042_valueprog = None
Guido van Rossum7c395db1994-07-04 22:14:49 +00001043def splitvalue(attr):
Guido van Rossume7b146f2000-02-04 15:28:42 +00001044 """splitvalue('attr=value') --> 'attr', 'value'."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001045 global _valueprog
1046 if _valueprog is None:
1047 import re
1048 _valueprog = re.compile('^([^=]*)=(.*)$')
Guido van Rossum332e1441997-09-29 23:23:46 +00001049
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001050 match = _valueprog.match(attr)
1051 if match: return match.group(1, 2)
1052 return attr, None
Guido van Rossum7c395db1994-07-04 22:14:49 +00001053
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001054def splitgophertype(selector):
Guido van Rossume7b146f2000-02-04 15:28:42 +00001055 """splitgophertype('/Xselector') --> 'X', 'selector'."""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001056 if selector[:1] == '/' and selector[1:2]:
1057 return selector[1], selector[2:]
1058 return None, selector
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001059
Raymond Hettinger803ce802005-09-10 06:49:04 +00001060_hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
1061_hextochr.update(('%02X' % i, chr(i)) for i in range(256))
1062
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001063def unquote(s):
Guido van Rossume7b146f2000-02-04 15:28:42 +00001064 """unquote('abc%20def') -> 'abc def'."""
Raymond Hettinger803ce802005-09-10 06:49:04 +00001065 res = s.split('%')
1066 for i in xrange(1, len(res)):
1067 item = res[i]
1068 try:
1069 res[i] = _hextochr[item[:2]] + item[2:]
1070 except KeyError:
1071 res[i] = '%' + item
Raymond Hettinger4b0f20d2005-10-15 16:41:53 +00001072 except UnicodeDecodeError:
1073 res[i] = unichr(int(item[:2], 16)) + item[2:]
Guido van Rossumb2493f82000-12-15 15:01:37 +00001074 return "".join(res)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001075
Guido van Rossum0564e121996-12-13 14:47:36 +00001076def unquote_plus(s):
Skip Montanaro79f1c172000-08-22 03:00:52 +00001077 """unquote('%7e/abc+def') -> '~/abc def'"""
Brett Cannonaaeffaf2004-03-23 23:50:17 +00001078 s = s.replace('+', ' ')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001079 return unquote(s)
Guido van Rossum0564e121996-12-13 14:47:36 +00001080
Fredrik Lundhb49f88b2000-09-24 18:51:25 +00001081always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
Jeremy Hylton6102e292000-08-31 15:48:10 +00001082 'abcdefghijklmnopqrstuvwxyz'
Jeremy Hylton7ae51bf2000-09-14 16:59:07 +00001083 '0123456789' '_.-')
Raymond Hettinger199d2f72005-09-09 22:27:13 +00001084_safemaps = {}
Jeremy Hylton7ae51bf2000-09-14 16:59:07 +00001085
Guido van Rossum7c395db1994-07-04 22:14:49 +00001086def quote(s, safe = '/'):
Jeremy Hylton7ae51bf2000-09-14 16:59:07 +00001087 """quote('abc def') -> 'abc%20def'
Fredrik Lundhb49f88b2000-09-24 18:51:25 +00001088
Jeremy Hylton7ae51bf2000-09-14 16:59:07 +00001089 Each part of a URL, e.g. the path info, the query, etc., has a
1090 different set of reserved characters that must be quoted.
1091
1092 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
1093 the following reserved characters.
1094
1095 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
1096 "$" | ","
1097
1098 Each of these characters is reserved in some component of a URL,
1099 but not necessarily in all of them.
1100
1101 By default, the quote function is intended for quoting the path
1102 section of a URL. Thus, it will not encode '/'. This character
1103 is reserved, but in typical usage the quote function is being
1104 called on a path where the existing slash characters are used as
1105 reserved characters.
1106 """
Raymond Hettinger199d2f72005-09-09 22:27:13 +00001107 cachekey = (safe, always_safe)
1108 try:
1109 safe_map = _safemaps[cachekey]
1110 except KeyError:
1111 safe += always_safe
1112 safe_map = {}
1113 for i in range(256):
1114 c = chr(i)
1115 safe_map[c] = (c in safe) and c or ('%%%02X' % i)
1116 _safemaps[cachekey] = safe_map
1117 res = map(safe_map.__getitem__, s)
Guido van Rossumb2493f82000-12-15 15:01:37 +00001118 return ''.join(res)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001119
Jeremy Hylton7ae51bf2000-09-14 16:59:07 +00001120def quote_plus(s, safe = ''):
1121 """Quote the query fragment of a URL; replacing ' ' with '+'"""
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001122 if ' ' in s:
Raymond Hettingercf6b6322005-09-10 18:17:54 +00001123 s = quote(s, safe + ' ')
1124 return s.replace(' ', '+')
1125 return quote(s, safe)
Guido van Rossum0564e121996-12-13 14:47:36 +00001126
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001127def urlencode(query,doseq=0):
1128 """Encode a sequence of two-element tuples or dictionary into a URL query string.
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001129
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001130 If any values in the query arg are sequences and doseq is true, each
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001131 sequence element is converted to a separate parameter.
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001132
1133 If the query arg is a sequence of two-element tuples, the order of the
1134 parameters in the output will match the order of parameters in the
1135 input.
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001136 """
Tim Peters658cba62001-02-09 20:06:00 +00001137
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001138 if hasattr(query,"items"):
1139 # mapping objects
1140 query = query.items()
1141 else:
1142 # it's a bother at times that strings and string-like objects are
1143 # sequences...
1144 try:
1145 # non-sequence items should not work with len()
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001146 # non-empty strings will fail this
Walter Dörwald65230a22002-06-03 15:58:32 +00001147 if len(query) and not isinstance(query[0], tuple):
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001148 raise TypeError
1149 # zero-length sequences of all types will get here and succeed,
1150 # but that's a minor nit - since the original implementation
1151 # allowed empty dicts that type of behavior probably should be
1152 # preserved for consistency
1153 except TypeError:
1154 ty,va,tb = sys.exc_info()
1155 raise TypeError, "not a valid non-string sequence or mapping object", tb
1156
Guido van Rossume7b146f2000-02-04 15:28:42 +00001157 l = []
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001158 if not doseq:
1159 # preserve old behavior
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001160 for k, v in query:
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001161 k = quote_plus(str(k))
1162 v = quote_plus(str(v))
1163 l.append(k + '=' + v)
1164 else:
Skip Montanaro14f1ad42001-01-28 21:11:12 +00001165 for k, v in query:
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001166 k = quote_plus(str(k))
Walter Dörwald65230a22002-06-03 15:58:32 +00001167 if isinstance(v, str):
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001168 v = quote_plus(v)
1169 l.append(k + '=' + v)
Guido van Rossum4b46c0a2002-05-24 17:58:05 +00001170 elif _is_unicode(v):
Skip Montanaroa5d23a12001-01-20 15:56:39 +00001171 # is there a reasonable way to convert to ASCII?
1172 # encode generates a string, but "replace" or "ignore"
1173 # lose information and "strict" can raise UnicodeError
1174 v = quote_plus(v.encode("ASCII","replace"))
1175 l.append(k + '=' + v)
1176 else:
1177 try:
1178 # is this a sufficient test for sequence-ness?
1179 x = len(v)
1180 except TypeError:
1181 # not a sequence
1182 v = quote_plus(str(v))
1183 l.append(k + '=' + v)
1184 else:
1185 # loop over the sequence
1186 for elt in v:
1187 l.append(k + '=' + quote_plus(str(elt)))
Guido van Rossumb2493f82000-12-15 15:01:37 +00001188 return '&'.join(l)
Guido van Rossum810a3391998-07-22 21:33:23 +00001189
Guido van Rossum442e7201996-03-20 15:33:11 +00001190# Proxy handling
Mark Hammond4f570b92000-07-26 07:04:38 +00001191def getproxies_environment():
1192 """Return a dictionary of scheme -> proxy server URL mappings.
1193
1194 Scan the environment for variables named <scheme>_proxy;
1195 this seems to be the standard convention. If you need a
1196 different way, you can pass a proxies dictionary to the
1197 [Fancy]URLopener constructor.
1198
1199 """
1200 proxies = {}
1201 for name, value in os.environ.items():
Guido van Rossumb2493f82000-12-15 15:01:37 +00001202 name = name.lower()
Mark Hammond4f570b92000-07-26 07:04:38 +00001203 if value and name[-6:] == '_proxy':
1204 proxies[name[:-6]] = value
1205 return proxies
1206
Jack Jansen11d9b062004-07-16 11:45:00 +00001207if sys.platform == 'darwin':
1208 def getproxies_internetconfig():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001209 """Return a dictionary of scheme -> proxy server URL mappings.
Guido van Rossum442e7201996-03-20 15:33:11 +00001210
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001211 By convention the mac uses Internet Config to store
1212 proxies. An HTTP proxy, for instance, is stored under
1213 the HttpProxy key.
Guido van Rossum442e7201996-03-20 15:33:11 +00001214
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001215 """
1216 try:
1217 import ic
1218 except ImportError:
1219 return {}
Fredrik Lundhb49f88b2000-09-24 18:51:25 +00001220
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001221 try:
1222 config = ic.IC()
1223 except ic.error:
1224 return {}
1225 proxies = {}
1226 # HTTP:
Raymond Hettinger54f02222002-06-01 14:18:47 +00001227 if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001228 try:
1229 value = config['HTTPProxyHost']
1230 except ic.error:
1231 pass
1232 else:
1233 proxies['http'] = 'http://%s' % value
1234 # FTP: XXXX To be done.
1235 # Gopher: XXXX To be done.
1236 return proxies
Mark Hammond4f570b92000-07-26 07:04:38 +00001237
Tim Peters55c12d42001-08-09 18:04:14 +00001238 def proxy_bypass(x):
1239 return 0
1240
Jack Jansen11d9b062004-07-16 11:45:00 +00001241 def getproxies():
1242 return getproxies_environment() or getproxies_internetconfig()
Tim Peters182b5ac2004-07-18 06:16:08 +00001243
Mark Hammond4f570b92000-07-26 07:04:38 +00001244elif os.name == 'nt':
1245 def getproxies_registry():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001246 """Return a dictionary of scheme -> proxy server URL mappings.
Mark Hammond4f570b92000-07-26 07:04:38 +00001247
1248 Win32 uses the registry to store proxies.
1249
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001250 """
1251 proxies = {}
Mark Hammond4f570b92000-07-26 07:04:38 +00001252 try:
1253 import _winreg
1254 except ImportError:
1255 # Std module, so should be around - but you never know!
1256 return proxies
1257 try:
Fredrik Lundhb49f88b2000-09-24 18:51:25 +00001258 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1259 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Mark Hammond4f570b92000-07-26 07:04:38 +00001260 proxyEnable = _winreg.QueryValueEx(internetSettings,
1261 'ProxyEnable')[0]
1262 if proxyEnable:
1263 # Returned as Unicode but problems if not converted to ASCII
1264 proxyServer = str(_winreg.QueryValueEx(internetSettings,
1265 'ProxyServer')[0])
Fredrik Lundhb49f88b2000-09-24 18:51:25 +00001266 if '=' in proxyServer:
1267 # Per-protocol settings
Mark Hammond4f570b92000-07-26 07:04:38 +00001268 for p in proxyServer.split(';'):
Fredrik Lundhb49f88b2000-09-24 18:51:25 +00001269 protocol, address = p.split('=', 1)
Guido van Rossumb955d6c2002-03-31 23:38:48 +00001270 # See if address has a type:// prefix
Guido van Rossum64e5aa92002-04-02 14:38:16 +00001271 import re
1272 if not re.match('^([^/:]+)://', address):
Guido van Rossumb955d6c2002-03-31 23:38:48 +00001273 address = '%s://%s' % (protocol, address)
1274 proxies[protocol] = address
Fredrik Lundhb49f88b2000-09-24 18:51:25 +00001275 else:
1276 # Use one setting for all protocols
1277 if proxyServer[:5] == 'http:':
1278 proxies['http'] = proxyServer
1279 else:
1280 proxies['http'] = 'http://%s' % proxyServer
1281 proxies['ftp'] = 'ftp://%s' % proxyServer
Mark Hammond4f570b92000-07-26 07:04:38 +00001282 internetSettings.Close()
1283 except (WindowsError, ValueError, TypeError):
1284 # Either registry key not found etc, or the value in an
1285 # unexpected format.
1286 # proxies already set up to be empty so nothing to do
1287 pass
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001288 return proxies
Guido van Rossum442e7201996-03-20 15:33:11 +00001289
Mark Hammond4f570b92000-07-26 07:04:38 +00001290 def getproxies():
1291 """Return a dictionary of scheme -> proxy server URL mappings.
1292
1293 Returns settings gathered from the environment, if specified,
1294 or the registry.
1295
1296 """
1297 return getproxies_environment() or getproxies_registry()
Tim Peters55c12d42001-08-09 18:04:14 +00001298
1299 def proxy_bypass(host):
1300 try:
1301 import _winreg
1302 import re
Tim Peters55c12d42001-08-09 18:04:14 +00001303 except ImportError:
1304 # Std modules, so should be around - but you never know!
1305 return 0
1306 try:
1307 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1308 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1309 proxyEnable = _winreg.QueryValueEx(internetSettings,
1310 'ProxyEnable')[0]
1311 proxyOverride = str(_winreg.QueryValueEx(internetSettings,
1312 'ProxyOverride')[0])
1313 # ^^^^ Returned as Unicode but problems if not converted to ASCII
1314 except WindowsError:
1315 return 0
1316 if not proxyEnable or not proxyOverride:
1317 return 0
1318 # try to make a host list from name and IP address.
1319 host = [host]
1320 try:
1321 addr = socket.gethostbyname(host[0])
1322 if addr != host:
1323 host.append(addr)
1324 except socket.error:
1325 pass
1326 # make a check value list from the registry entry: replace the
1327 # '<local>' string by the localhost entry and the corresponding
1328 # canonical entry.
1329 proxyOverride = proxyOverride.split(';')
1330 i = 0
1331 while i < len(proxyOverride):
1332 if proxyOverride[i] == '<local>':
1333 proxyOverride[i:i+1] = ['localhost',
1334 '127.0.0.1',
1335 socket.gethostname(),
1336 socket.gethostbyname(
1337 socket.gethostname())]
1338 i += 1
1339 # print proxyOverride
1340 # now check if we match one of the registry values.
1341 for test in proxyOverride:
Tim Petersab9ba272001-08-09 21:40:30 +00001342 test = test.replace(".", r"\.") # mask dots
1343 test = test.replace("*", r".*") # change glob sequence
1344 test = test.replace("?", r".") # change glob char
Tim Peters55c12d42001-08-09 18:04:14 +00001345 for val in host:
1346 # print "%s <--> %s" %( test, val )
1347 if re.match(test, val, re.I):
1348 return 1
1349 return 0
1350
Mark Hammond4f570b92000-07-26 07:04:38 +00001351else:
1352 # By default use environment variables
1353 getproxies = getproxies_environment
1354
Tim Peters55c12d42001-08-09 18:04:14 +00001355 def proxy_bypass(host):
1356 return 0
Guido van Rossum442e7201996-03-20 15:33:11 +00001357
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001358# Test and time quote() and unquote()
1359def test1():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001360 s = ''
1361 for i in range(256): s = s + chr(i)
1362 s = s*4
1363 t0 = time.time()
1364 qs = quote(s)
1365 uqs = unquote(qs)
1366 t1 = time.time()
1367 if uqs != s:
1368 print 'Wrong!'
Walter Dörwald70a6b492004-02-12 17:35:32 +00001369 print repr(s)
1370 print repr(qs)
1371 print repr(uqs)
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001372 print round(t1 - t0, 3), 'sec'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001373
1374
Guido van Rossum9ab96d41998-09-28 14:07:00 +00001375def reporthook(blocknum, blocksize, totalsize):
1376 # Report during remote transfers
Guido van Rossumb2493f82000-12-15 15:01:37 +00001377 print "Block number: %d, Block size: %d, Total size: %d" % (
1378 blocknum, blocksize, totalsize)
Guido van Rossum9ab96d41998-09-28 14:07:00 +00001379
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001380# Test program
Guido van Rossum23490151998-06-25 02:39:00 +00001381def test(args=[]):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001382 if not args:
1383 args = [
1384 '/etc/passwd',
1385 'file:/etc/passwd',
1386 'file://localhost/etc/passwd',
Andrew M. Kuchling56a42352002-03-18 22:18:46 +00001387 'ftp://ftp.python.org/pub/python/README',
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001388## 'gopher://gopher.micro.umn.edu/1/',
1389 'http://www.python.org/index.html',
1390 ]
Guido van Rossum09c8b6c1999-12-07 21:37:17 +00001391 if hasattr(URLopener, "open_https"):
1392 args.append('https://synergy.as.cmu.edu/~geek/')
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001393 try:
1394 for url in args:
1395 print '-'*10, url, '-'*10
1396 fn, h = urlretrieve(url, None, reporthook)
Guido van Rossumb2493f82000-12-15 15:01:37 +00001397 print fn
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001398 if h:
1399 print '======'
1400 for k in h.keys(): print k + ':', h[k]
1401 print '======'
1402 fp = open(fn, 'rb')
1403 data = fp.read()
1404 del fp
1405 if '\r' in data:
1406 table = string.maketrans("", "")
Guido van Rossumb2493f82000-12-15 15:01:37 +00001407 data = data.translate(table, "\r")
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001408 print data
1409 fn, h = None, None
1410 print '-'*40
1411 finally:
1412 urlcleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001413
Guido van Rossum23490151998-06-25 02:39:00 +00001414def main():
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001415 import getopt, sys
1416 try:
1417 opts, args = getopt.getopt(sys.argv[1:], "th")
1418 except getopt.error, msg:
1419 print msg
1420 print "Use -h for help"
1421 return
1422 t = 0
1423 for o, a in opts:
1424 if o == '-t':
1425 t = t + 1
1426 if o == '-h':
1427 print "Usage: python urllib.py [-t] [url ...]"
1428 print "-t runs self-test;",
1429 print "otherwise, contents of urls are printed"
1430 return
1431 if t:
1432 if t > 1:
1433 test1()
1434 test(args)
1435 else:
1436 if not args:
1437 print "Use -h for help"
1438 for url in args:
1439 print urlopen(url).read(),
Guido van Rossum23490151998-06-25 02:39:00 +00001440
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001441# Run test program when run as a script
1442if __name__ == '__main__':
Jeremy Hyltonf90b0021999-02-25 16:12:12 +00001443 main()