blob: 3ecfea7c0d04d52b33c24fb35c15cb37f9c56cc1 [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""Open an arbitrary URL.
2
3See the following document for more info on URLs:
4"Names and Addresses, URIs, URLs, URNs, URCs", at
5http://www.w3.org/pub/WWW/Addressing/Overview.html
6
7See also the HTTP spec (from which the error codes are derived):
8"HTTP - Hypertext Transfer Protocol", at
9http://www.w3.org/pub/WWW/Protocols/
10
11Related standards and specs:
12- RFC1808: the "relative URL" spec. (authoritative status)
13- RFC1738 - the "URL standard". (authoritative status)
14- RFC1630 - the "URI spec". (informational status)
15
16The object returned by URLopener().open(file) will differ per
protocol.  All you know is that it has methods read(), readline(),
18readlines(), fileno(), close() and info(). The read*(), fileno()
Fredrik Lundhb49f88b2000-09-24 18:51:25 +000019and close() methods work like those of open files.
Guido van Rossume7b146f2000-02-04 15:28:42 +000020The info() method returns a mimetools.Message object which can be
21used to query various info about the object, if available.
22(mimetools.Message objects are queried with the getheader() method.)
23"""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000024
Jeremy Hylton5f22af12007-08-16 17:55:18 +000025import httplib
Jack Jansendc3e3f61995-12-15 13:22:13 +000026import os
Jeremy Hylton5f22af12007-08-16 17:55:18 +000027import socket
Guido van Rossum3c8484e1996-11-20 22:02:24 +000028import sys
Jeremy Hylton5f22af12007-08-16 17:55:18 +000029import time
Brett Cannon69200fa2004-03-23 21:26:39 +000030from urlparse import urljoin as basejoin
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000031
Skip Montanaro40fc1602001-03-01 04:27:19 +000032__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
33 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
Skip Montanaro44d5e0c2001-03-13 19:47:16 +000034 "urlencode", "url2pathname", "pathname2url", "splittag",
35 "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
36 "splittype", "splithost", "splituser", "splitpasswd", "splitport",
37 "splitnport", "splitquery", "splitattr", "splitvalue",
Guido van Rossumd59da4b2007-05-22 18:11:13 +000038 "getproxies"]
Skip Montanaro40fc1602001-03-01 04:27:19 +000039
Martin v. Löwis3e865952006-01-24 15:51:21 +000040__version__ = '1.17' # XXX This version is not always updated :-(
Guido van Rossumf668d171997-06-06 21:11:11 +000041
Jeremy Hyltonf90b0021999-02-25 16:12:12 +000042MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
Guido van Rossum6cb15a01995-06-22 19:00:13 +000043
# Helper for non-unix systems: choose, at import time, which implementation
# of the file-URL <-> path converters this module exposes.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        # Fallback when os.name is neither 'mac' nor 'nt': the result is
        # whatever unquote() returns for the URL.
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        # Fallback counterpart: the result is whatever quote() returns.
        return quote(pathname)
Guido van Rossum33add0a1998-12-18 15:25:22 +000059
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000060# This really consists of two pieces:
61# (1) a class which handles opening of all sorts of URLs
62# (plus assorted utilities etc.)
63# (2) a set of functions for parsing URLs
64# XXX Should these be separated out into different modules?
65
66
67# Shortcut for basic usage
_urlopener = None
def urlopen(url, data=None, proxies=None):
    """urlopen(url [, data [, proxies]]) -> open file-like object

    When proxies is given, a fresh FancyURLopener is built for this call
    only and is not cached.  Otherwise the module-level opener is created
    on first use and reused afterwards.  data, when not None, is passed
    on to the opener's open() method.
    """
    global _urlopener
    if proxies is not None:
        # One-off opener for this call; the shared opener is left alone.
        opener = FancyURLopener(proxies=proxies)
    else:
        if not _urlopener:
            _urlopener = FancyURLopener()
        opener = _urlopener
    if data is None:
        return opener.open(url)
    return opener.open(url, data)
Jeremy Hylton39b198d2007-08-04 19:22:00 +000083
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve url through the shared module-level opener.

    The shared FancyURLopener is created on first use; all arguments are
    forwarded unchanged to its retrieve() method, whose result is returned.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
Jeremy Hylton39b198d2007-08-04 19:22:00 +000089
def urlcleanup():
    """Run cleanup() on the shared module-level opener, if one exists."""
    opener = _urlopener
    if not opener:
        return
    opener.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000093
class ContentTooShortError(IOError):
    """Raised when the downloaded size does not match content-length.

    The extra 'content' argument is kept on the instance as .content so
    the caller can still get at whatever was retrieved.
    """

    def __init__(self, message, content):
        super(ContentTooShortError, self).__init__(message)
        self.content = content
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000099
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000100ftpcache = {}
101class URLopener:
Guido van Rossume7b146f2000-02-04 15:28:42 +0000102 """Class to open URLs.
103 This is a class rather than just a subroutine because we may need
104 more than one set of global protocol-specific options.
105 Note -- this is a base class for those who don't want the
106 automatic handling of errors type 302 (relocated) and 401
107 (authorization needed)."""
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000108
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000109 __tempfiles = None
Guido van Rossum29e77811996-11-27 19:39:58 +0000110
Guido van Rossumba311382000-08-24 16:18:04 +0000111 version = "Python-urllib/%s" % __version__
112
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000113 # Constructor
def __init__(self, proxies=None, **x509):
    """Set up proxies, SSL file names, default headers and caches.

    proxies -- mapping from URL scheme to proxy URL; when None, the
               result of getproxies() is used.
    x509    -- optional 'key_file' and 'cert_file' keyword arguments;
               missing ones are stored as None.
    """
    if proxies is None:
        proxies = getproxies()
    assert hasattr(proxies, 'keys'), "proxies must be a mapping"
    self.proxies = proxies
    self.key_file, self.cert_file = x509.get('key_file'), x509.get('cert_file')
    self.addheaders = [('User-Agent', self.version)]

    # Temporary files created by retrieve(); removed again by cleanup().
    self.__tempfiles = []
    # cleanup() may run when module globals are gone, so keep our own
    # reference to os.unlink.
    self.__unlink = os.unlink

    # Undocumented feature: if you assign {} to tempcache, it is used
    # to cache files retrieved with self.retrieve().  This is not
    # enabled by default since it does not work for changing documents
    # (and there is no logic to check expiration headers yet).
    self.tempcache = None

    # Undocumented feature: you can use a different ftp cache by
    # assigning to the .ftpcache member, in case you want logically
    # independent URL openers.
    # XXX This is not threadsafe.  Bah.
    self.ftpcache = ftpcache
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000136
def __del__(self):
    """Finalizer: delegate to close()."""
    self.close()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000139
def close(self):
    """Close the opener; this simply delegates to cleanup()."""
    self.cleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000142
def cleanup(self):
    """Delete the recorded temporary files and empty the temp cache.

    Files that cannot be removed (OSError) are silently skipped.
    """
    # This code sometimes runs when the rest of this module
    # has already been deleted, so it can't use any globals
    # or import anything.
    pending = self.__tempfiles
    if pending:
        remove = self.__unlink
        for path in pending:
            try:
                remove(path)
            except OSError:
                pass
        # Empty the list in place so the instance keeps the same object.
        del pending[:]
    cache = self.tempcache
    if cache:
        cache.clear()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000156
def addheader(self, *args):
    """Add a header to be used by the HTTP interface only
    e.g. u.addheader('Accept', 'sound/basic')"""
    # The positional arguments are appended as a single tuple.
    self.addheaders.append(args)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000161
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000162 # External interface
def open(self, fullurl, data=None):
    """Use URLopener().open(file) instead of open(file, 'r').

    The URL is dispatched to the method named open_<scheme> (with '-'
    replaced by '_'); a URL without a scheme is treated as 'file'.  If
    the scheme has an entry in self.proxies, the proxy's scheme selects
    the handler and the handler receives a (proxyhost, fullurl) pair
    instead of a string.  data, when not None, is passed to the handler
    as a second argument.

    Raises IOError('socket error', msg) when the handler raises
    socket.error; the original traceback is preserved.
    """
    fullurl = unwrap(toBytes(fullurl))
    if self.tempcache and fullurl in self.tempcache:
        # Previously retrieved: serve the cached local copy.
        filename, headers = self.tempcache[fullurl]
        fp = open(filename, 'rb')
        return addinfourl(fp, headers, fullurl)
    urltype, url = splittype(fullurl)
    if not urltype:
        urltype = 'file'
    if urltype in self.proxies:
        proxy = self.proxies[urltype]
        urltype, proxyhost = splittype(proxy)
        host, selector = splithost(proxyhost)
        url = (host, fullurl) # Signal special case to open_*()
    else:
        proxy = None
    name = 'open_' + urltype
    self.type = urltype
    name = name.replace('-', '_')
    if not hasattr(self, name):
        if proxy:
            return self.open_unknown_proxy(proxy, fullurl, data)
        else:
            return self.open_unknown(fullurl, data)
    try:
        if data is None:
            return getattr(self, name)(url)
        else:
            return getattr(self, name)(url, data)
    except socket.error as msg:
        # Call form of the old three-argument raise statement.
        raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000195
def open_unknown(self, fullurl, data=None):
    """Overridable interface to open unknown URL type.

    The default implementation always raises
    IOError('url error', 'unknown url type', <scheme>).
    """
    urltype, url = splittype(fullurl)
    raise IOError('url error', 'unknown url type', urltype)
Guido van Rossumca445401995-08-29 19:19:12 +0000200
def open_unknown_proxy(self, proxy, fullurl, data=None):
    """Overridable interface to open a URL through an unknown proxy type.

    The default implementation always raises
    IOError('url error', 'invalid proxy for <scheme>', proxy).
    """
    urltype, url = splittype(fullurl)
    raise IOError('url error', 'invalid proxy for %s' % urltype, proxy)
205
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000206 # External interface
def retrieve(self, url, filename=None, reporthook=None, data=None):
    """retrieve(url) returns (filename, headers) for a local object
    or (tempfilename, headers) for a remote object.

    filename   -- where to store the data; when omitted, a temporary
                  file is created (and remembered for cleanup()).
    reporthook -- optional callable invoked as
                  reporthook(blocknum, blocksize, totalsize) once before
                  the first read and once after every block; totalsize
                  is -1 when no Content-Length header is present.
    data       -- passed on to open().

    Raises ContentTooShortError when a Content-Length header is present
    and fewer bytes than announced were received.
    """
    url = unwrap(toBytes(url))
    if self.tempcache and url in self.tempcache:
        return self.tempcache[url]
    urltype, url1 = splittype(url)
    if filename is None and (not urltype or urltype == 'file'):
        # Local object: no copy is needed, just report its path.
        try:
            fp = self.open_local_file(url1)
            hdrs = fp.info()
            # Close explicitly instead of relying on garbage collection.
            fp.close()
            return url2pathname(splithost(url1)[1]), hdrs
        except IOError:
            # Not a usable local file; fall through to the generic path.
            pass
    fp = self.open(url, data)
    try:
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            import tempfile
            # Derive the temp file suffix from the path part of the URL.
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            (fd, filename) = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        try:
            result = filename, headers
            if self.tempcache is not None:
                self.tempcache[url] = result
            bs = 1024*8
            size = -1
            read = 0
            blocknum = 0
            # Parse Content-Length whether or not a reporthook was
            # supplied, so the completeness check below always works.
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            while 1:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
        finally:
            # Close the output file even if reading or the hook fails.
            tfp.close()
    finally:
        fp.close()

    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
        raise ContentTooShortError("retrieval incomplete: got only %i out "
                                   "of %i bytes" % (read, size), result)

    return result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000267
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000268 # Each method named open_<type> knows how to open that type of URL
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000269
def _open_generic_http(self, connection_factory, url, data):
    """Make an HTTP connection using connection_factory.

    This is an internal method that should be called from
    open_http() or open_https().

    Arguments:
    - connection_factory should take a host name and return an
      HTTPConnection instance.
    - url is the url to retrieve or a (host, full-url) pair; the pair
      form is what open() passes when going through a proxy.
    - data is payload for a POST request or None.

    Returns an addinfourl object for a 200 response; any other status
    is handed to self.http_error().  Raises IOError when no host can be
    determined or when the status line is malformed.
    """

    user_passwd = None
    proxy_passwd = None
    if isinstance(url, str):
        host, selector = splithost(url)
        if host:
            user_passwd, host = splituser(host)
            host = unquote(host)
        realhost = host
    else:
        host, selector = url
        # check whether the proxy contains authorization information
        proxy_passwd, host = splituser(host)
        # now we proceed with the url we want to obtain
        urltype, rest = splittype(selector)
        url = rest
        user_passwd = None
        if urltype.lower() != 'http':
            realhost = None
        else:
            realhost, rest = splithost(rest)
            if realhost:
                user_passwd, realhost = splituser(realhost)
            if user_passwd:
                # Rebuild the selector without the user:passwd part.
                selector = "%s://%s%s" % (urltype, realhost, rest)
            if proxy_bypass(realhost):
                host = realhost

    #print "proxy via http:", host, selector
    if not host:
        raise IOError('http error', 'no host given')

    if proxy_passwd:
        import base64
        proxy_auth = base64.b64encode(proxy_passwd).strip()
    else:
        proxy_auth = None

    if user_passwd:
        import base64
        auth = base64.b64encode(user_passwd).strip()
    else:
        auth = None
    http_conn = connection_factory(host)
    # XXX We should fix urllib so that it works with HTTP/1.1.
    http_conn._http_vsn = 10
    http_conn._http_vsn_str = "HTTP/1.0"

    headers = {}
    if proxy_auth:
        headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
    if auth:
        headers["Authorization"] = "Basic %s" % auth
    if realhost:
        headers["Host"] = realhost
    # Headers added by the user go last, so they override the ones above.
    for header, value in self.addheaders:
        headers[header] = value

    if data is not None:
        headers["Content-Type"] = "application/x-www-form-urlencoded"
        http_conn.request("POST", selector, data, headers)
    else:
        http_conn.request("GET", selector, headers=headers)

    try:
        response = http_conn.getresponse()
    except httplib.BadStatusLine:
        # something went wrong with the HTTP status line
        raise IOError('http protocol error', 0,
                      'got a bad status line', None)

    if response.status == 200:
        return addinfourl(response.fp, response.msg, "http:" + url)
    else:
        return self.http_error(
            url, response.fp,
            response.status, response.reason, response.msg, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000358
def open_http(self, url, data=None):
    """Use HTTP protocol.

    Delegates to _open_generic_http() with httplib.HTTPConnection as
    the connection factory.
    """
    return self._open_generic_http(httplib.HTTPConnection, url, data)
362
def http_error(self, url, fp, errcode, errmsg, headers, data=None):
    """Handle http errors.

    Derived class can override this, or provide specific handlers
    named http_error_DDD where DDD is the 3-digit error code.

    A specific handler is called with data only when data is not None.
    If it returns a true value, that value is the result; otherwise
    (or when no specific handler exists) http_error_default() decides.
    """
    handler_name = 'http_error_%d' % errcode
    if hasattr(self, handler_name):
        handler = getattr(self, handler_name)
        extra = () if data is None else (data,)
        outcome = handler(url, fp, errcode, errmsg, headers, *extra)
        if outcome:
            return outcome
    return self.http_error_default(url, fp, errcode, errmsg, headers)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000378
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Default error handler: close the connection and raise IOError.

    The response is read and closed first; the IOError is constructed
    from ('http error', errcode, errmsg, headers).
    """
    # Read the response before closing; the data itself is discarded.
    fp.read()
    fp.close()
    raise IOError('http error', errcode, errmsg, headers)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000384
# The HTTPS methods are only defined when the socket module has an
# "ssl" attribute.
if hasattr(socket, "ssl"):
    def _https_connection(self, host):
        """Return an httplib.HTTPSConnection to host, built with this
        opener's key_file and cert_file."""
        return httplib.HTTPSConnection(host,
                                       key_file=self.key_file,
                                       cert_file=self.cert_file)

    def open_https(self, url, data=None):
        """Use HTTPS protocol.

        Delegates to _open_generic_http() with _https_connection as the
        connection factory.
        """
        return self._open_generic_http(self._https_connection, url, data)
Fred Drake567ca8e2000-08-21 21:42:42 +0000394
def open_file(self, url):
    """Use local file or FTP depending on form of URL.

    A URL of the form //host/... whose host part is neither empty nor
    'localhost' is handed to open_ftp(); everything else goes to
    open_local_file().  Raises IOError when url is not a string, which
    is how open() signals a proxied request.
    """
    if not isinstance(url, str):
        raise IOError('file error', 'proxy support for file protocol currently not implemented')
    if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
        return self.open_ftp(url)
    else:
        return self.open_local_file(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000403
def open_local_file(self, url):
    """Use local file.

    Returns an addinfourl object wrapping the file opened in binary
    mode, with Content-Type, Content-Length and Last-modified headers
    built from the file's stat() result.

    Raises IOError when the file cannot be stat'ed, or when the URL
    names a host with a port or a host that does not resolve to this
    machine.
    """
    import mimetypes, mimetools, email.utils
    from io import StringIO
    host, file = splithost(url)
    localname = url2pathname(file)
    try:
        stats = os.stat(localname)
    except OSError as e:
        raise IOError(e.errno, e.strerror, e.filename)
    size = stats.st_size
    modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
    mtype = mimetypes.guess_type(url)[0]
    headers = mimetools.Message(StringIO(
        'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
        (mtype or 'text/plain', size, modified)))
    if host:
        host, port = splitport(host)
        # Only a port-less host resolving to this machine is accepted.
        if port or \
           socket.gethostbyname(host) not in (localhost(), thishost()):
            raise IOError('local file error', 'not on local host')
    urlfile = file
    if file[:1] == '/':
        urlfile = 'file://' + file
    return addinfourl(open(localname, 'rb'),
                      headers, urlfile)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000435
def open_ftp(self, url):
    """Use FTP protocol.

    Connections are kept in self.ftpcache, keyed by
    (user, host, port, directory path).  The transfer type defaults to
    'D' when the URL names no file and 'I' otherwise, and can be
    overridden with a ';type=' attribute (a, i or d).

    Raises IOError when url is not a string (proxied request), when no
    host is given, or when the FTP layer reports an error (original
    traceback preserved).
    """
    if not isinstance(url, str):
        raise IOError('ftp error', 'proxy support for ftp protocol currently not implemented')
    import mimetypes, mimetools
    from io import StringIO
    host, path = splithost(url)
    if not host:
        raise IOError('ftp error', 'no host given')
    host, port = splitport(host)
    user, host = splituser(host)
    if user:
        user, passwd = splitpasswd(user)
    else:
        passwd = None
    host = unquote(host)
    user = unquote(user or '')
    passwd = unquote(passwd or '')
    host = socket.gethostbyname(host)
    if not port:
        import ftplib
        port = ftplib.FTP_PORT
    else:
        port = int(port)
    path, attrs = splitattr(path)
    path = unquote(path)
    dirs = path.split('/')
    dirs, file = dirs[:-1], dirs[-1]
    if dirs and not dirs[0]: dirs = dirs[1:]
    if dirs and not dirs[0]: dirs[0] = '/'
    key = user, host, port, '/'.join(dirs)
    # XXX thread unsafe!
    if len(self.ftpcache) > MAXFTPCACHE:
        # Prune the cache, rather arbitrarily.  Iterate over a snapshot
        # of the keys: entries are deleted inside the loop, which is
        # not allowed while iterating the dictionary itself.
        for k in list(self.ftpcache.keys()):
            if k != key:
                v = self.ftpcache[k]
                del self.ftpcache[k]
                v.close()
    try:
        if not key in self.ftpcache:
            self.ftpcache[key] = \
                ftpwrapper(user, passwd, host, port, dirs)
        if not file:
            xfertype = 'D'
        else:
            xfertype = 'I'
        for attr in attrs:
            attr, value = splitvalue(attr)
            if attr.lower() == 'type' and \
               value in ('a', 'A', 'i', 'I', 'd', 'D'):
                xfertype = value.upper()
        (fp, retrlen) = self.ftpcache[key].retrfile(file, xfertype)
        mtype = mimetypes.guess_type("ftp:" + url)[0]
        headers = ""
        if mtype:
            headers += "Content-Type: %s\n" % mtype
        if retrlen is not None and retrlen >= 0:
            headers += "Content-Length: %d\n" % retrlen
        headers = mimetools.Message(StringIO(headers))
        return addinfourl(fp, headers, "ftp:" + url)
    except ftperrors() as msg:
        # Call form of the old three-argument raise statement.
        raise IOError('ftp error', msg).with_traceback(sys.exc_info()[2])
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000494
def open_data(self, url, data=None):
    """Use "data" URL.

    The data argument (POSTed data) is ignored.  Returns an addinfourl
    object whose headers carry Date, Content-type and Content-Length
    and whose body is the decoded payload.

    Raises IOError when url is not a string (proxied request) or when
    it contains no ',' separator.
    """
    if not isinstance(url, str):
        raise IOError('data error', 'proxy support for data protocol currently not implemented')
    # ignore POSTed data
    #
    # syntax of data URLs:
    # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    # mediatype := [ type "/" subtype ] *( ";" parameter )
    # data      := *urlchar
    # parameter := attribute "=" value
    import mimetools
    from io import StringIO
    try:
        [mediatype, data] = url.split(',', 1)
    except ValueError:
        raise IOError('data error', 'bad data URL')
    if not mediatype:
        mediatype = 'text/plain;charset=US-ASCII'
    # A trailing ";xxx" without '=' is the encoding, not a parameter.
    semi = mediatype.rfind(';')
    if semi >= 0 and '=' not in mediatype[semi:]:
        encoding = mediatype[semi+1:]
        mediatype = mediatype[:semi]
    else:
        encoding = ''
    msg = []
    # Spell out %H:%M:%S; the %T shorthand is not available on every
    # platform's strftime().
    msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                        time.gmtime(time.time())))
    msg.append('Content-type: %s' % mediatype)
    if encoding == 'base64':
        import base64
        data = base64.decodestring(data)
    else:
        data = unquote(data)
    msg.append('Content-Length: %d' % len(data))
    msg.append('')
    msg.append(data)
    msg = '\n'.join(msg)
    f = StringIO(msg)
    headers = mimetools.Message(f, 0)
    #f.fileno = None     # needed for addinfourl
    return addinfourl(f, headers, url)
Guido van Rossum6d4d1c21998-03-12 14:32:55 +0000537
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000538
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000539class FancyURLopener(URLopener):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000540 """Derived class with handlers for errors we can handle (perhaps)."""
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000541
def __init__(self, *args, **kwargs):
    """Initialise the URLopener base, then this class's own state.

    All arguments are forwarded unchanged to URLopener.__init__().
    """
    URLopener.__init__(self, *args, **kwargs)
    # Starts out empty; presumably filled by the authentication
    # handlers -- verify against the rest of the class.
    self.auth_cache = {}
    # Redirect counter and its limit; see http_error_302().
    self.tries = 0
    self.maxtries = 10
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000547
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Default error handling -- don't raise an exception.

    The error response is returned wrapped in an addinfourl object;
    errcode and errmsg are not used here.
    """
    return addinfourl(fp, headers, "http:" + url)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000551
def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 302 -- relocated (temporarily).

    Follows the redirect via redirect_internal().  Once self.tries
    reaches self.maxtries (when maxtries is non-zero) the chain is
    reported as a 500 "Redirect Recursion" error through
    http_error_500() if defined, else http_error_default().

    self.tries is reset to 0 on every exit path, including when
    following the redirect raises.
    """
    self.tries += 1
    try:
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            # Reset before calling the handler, as it may open URLs itself.
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        return self.redirect_internal(url, fp, errcode, errmsg, headers,
                                      data)
    finally:
        # Without this, an exception while following the redirect would
        # leave a stale count behind for the next request.
        self.tries = 0
567
def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
    """Open the redirect target named by the response headers.

    The 'location' header is preferred, then 'uri'; with neither
    present None is returned and fp is left untouched.  Otherwise the
    old response is read and closed and the target, joined with the
    original URL, is opened without passing data on.
    """
    for field in ('location', 'uri'):
        if field in headers:
            target = headers[field]
            break
    else:
        return
    fp.read()
    fp.close()
    # In case the server sent a relative URL, join with original:
    target = basejoin(self.type + ":" + url, target)
    return self.open(target)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000580
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000581 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000582 """Error 301 -- also relocated (permanently)."""
583 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
Guido van Rossume6ad8911996-09-10 17:02:56 +0000584
Raymond Hettinger024aaa12003-04-24 15:32:12 +0000585 def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
586 """Error 303 -- also relocated (essentially identical to 302)."""
587 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
588
Guido van Rossumfa19f7c2003-05-16 01:46:51 +0000589 def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
590 """Error 307 -- relocated, but turn POST into error."""
591 if data is None:
592 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
593 else:
594 return self.http_error_default(url, fp, errcode, errmsg, headers)
595
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000596 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000597 """Error 401 -- authentication required.
Martin v. Löwis3e865952006-01-24 15:51:21 +0000598 This function supports Basic authentication only."""
Raymond Hettinger54f02222002-06-01 14:18:47 +0000599 if not 'www-authenticate' in headers:
Tim Peters85ba6732001-02-28 08:26:44 +0000600 URLopener.http_error_default(self, url, fp,
Fred Drakec680ae82001-10-13 18:37:07 +0000601 errcode, errmsg, headers)
Moshe Zadkae99bd172001-02-27 06:27:04 +0000602 stuff = headers['www-authenticate']
603 import re
604 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
605 if not match:
Tim Peters85ba6732001-02-28 08:26:44 +0000606 URLopener.http_error_default(self, url, fp,
Moshe Zadkae99bd172001-02-27 06:27:04 +0000607 errcode, errmsg, headers)
608 scheme, realm = match.groups()
609 if scheme.lower() != 'basic':
Tim Peters85ba6732001-02-28 08:26:44 +0000610 URLopener.http_error_default(self, url, fp,
Moshe Zadkae99bd172001-02-27 06:27:04 +0000611 errcode, errmsg, headers)
612 name = 'retry_' + self.type + '_basic_auth'
613 if data is None:
614 return getattr(self,name)(url, realm)
615 else:
616 return getattr(self,name)(url, realm, data)
Tim Peters92037a12006-01-24 22:44:08 +0000617
Martin v. Löwis3e865952006-01-24 15:51:21 +0000618 def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
619 """Error 407 -- proxy authentication required.
620 This function supports Basic authentication only."""
621 if not 'proxy-authenticate' in headers:
622 URLopener.http_error_default(self, url, fp,
623 errcode, errmsg, headers)
624 stuff = headers['proxy-authenticate']
625 import re
626 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
627 if not match:
628 URLopener.http_error_default(self, url, fp,
629 errcode, errmsg, headers)
630 scheme, realm = match.groups()
631 if scheme.lower() != 'basic':
632 URLopener.http_error_default(self, url, fp,
633 errcode, errmsg, headers)
634 name = 'retry_proxy_' + self.type + '_basic_auth'
635 if data is None:
636 return getattr(self,name)(url, realm)
637 else:
638 return getattr(self,name)(url, realm, data)
Tim Peters92037a12006-01-24 22:44:08 +0000639
Martin v. Löwis3e865952006-01-24 15:51:21 +0000640 def retry_proxy_http_basic_auth(self, url, realm, data=None):
641 host, selector = splithost(url)
642 newurl = 'http://' + host + selector
643 proxy = self.proxies['http']
644 urltype, proxyhost = splittype(proxy)
645 proxyhost, proxyselector = splithost(proxyhost)
646 i = proxyhost.find('@') + 1
647 proxyhost = proxyhost[i:]
648 user, passwd = self.get_user_passwd(proxyhost, realm, i)
649 if not (user or passwd): return None
650 proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
651 self.proxies['http'] = 'http://' + proxyhost + proxyselector
652 if data is None:
653 return self.open(newurl)
654 else:
655 return self.open(newurl, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000656
Martin v. Löwis3e865952006-01-24 15:51:21 +0000657 def retry_proxy_https_basic_auth(self, url, realm, data=None):
658 host, selector = splithost(url)
659 newurl = 'https://' + host + selector
660 proxy = self.proxies['https']
661 urltype, proxyhost = splittype(proxy)
662 proxyhost, proxyselector = splithost(proxyhost)
663 i = proxyhost.find('@') + 1
664 proxyhost = proxyhost[i:]
665 user, passwd = self.get_user_passwd(proxyhost, realm, i)
666 if not (user or passwd): return None
667 proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
668 self.proxies['https'] = 'https://' + proxyhost + proxyselector
669 if data is None:
670 return self.open(newurl)
671 else:
672 return self.open(newurl, data)
Tim Peters92037a12006-01-24 22:44:08 +0000673
Guido van Rossum3c8baed2000-02-01 23:36:55 +0000674 def retry_http_basic_auth(self, url, realm, data=None):
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000675 host, selector = splithost(url)
Guido van Rossumb2493f82000-12-15 15:01:37 +0000676 i = host.find('@') + 1
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000677 host = host[i:]
678 user, passwd = self.get_user_passwd(host, realm, i)
679 if not (user or passwd): return None
Guido van Rossumafc4f042001-01-15 18:31:13 +0000680 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000681 newurl = 'http://' + host + selector
Guido van Rossum3c8baed2000-02-01 23:36:55 +0000682 if data is None:
683 return self.open(newurl)
684 else:
685 return self.open(newurl, data)
Fredrik Lundhb49f88b2000-09-24 18:51:25 +0000686
Guido van Rossum3c8baed2000-02-01 23:36:55 +0000687 def retry_https_basic_auth(self, url, realm, data=None):
Tim Peterse1190062001-01-15 03:34:38 +0000688 host, selector = splithost(url)
689 i = host.find('@') + 1
690 host = host[i:]
691 user, passwd = self.get_user_passwd(host, realm, i)
692 if not (user or passwd): return None
Guido van Rossumafc4f042001-01-15 18:31:13 +0000693 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
Martin v. Löwis3e865952006-01-24 15:51:21 +0000694 newurl = 'https://' + host + selector
695 if data is None:
696 return self.open(newurl)
697 else:
698 return self.open(newurl, data)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000699
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000700 def get_user_passwd(self, host, realm, clear_cache = 0):
Guido van Rossumb2493f82000-12-15 15:01:37 +0000701 key = realm + '@' + host.lower()
Raymond Hettinger54f02222002-06-01 14:18:47 +0000702 if key in self.auth_cache:
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000703 if clear_cache:
704 del self.auth_cache[key]
705 else:
706 return self.auth_cache[key]
707 user, passwd = self.prompt_user_passwd(host, realm)
708 if user or passwd: self.auth_cache[key] = (user, passwd)
709 return user, passwd
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000710
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000711 def prompt_user_passwd(self, host, realm):
Guido van Rossume7b146f2000-02-04 15:28:42 +0000712 """Override this in a GUI environment!"""
Guido van Rossum7cba8502007-03-19 22:23:59 +0000713 import getpass
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000714 try:
Guido van Rossum7cba8502007-03-19 22:23:59 +0000715 user = input("Enter username for %s at %s: " % (realm, host))
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000716 passwd = getpass.getpass("Enter password for %s in %s at %s: " %
717 (user, realm, host))
718 return user, passwd
719 except KeyboardInterrupt:
Guido van Rossumbe19ed72007-02-09 05:37:30 +0000720 print()
Jeremy Hyltonf90b0021999-02-25 16:12:12 +0000721 return None, None
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000722
723
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000724# Utility functions
725
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup is done once and the answer is kept in _localhost.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000733
_thishost = None
def thishost():
    """Return the IP address of the current host.

    The lookup is done once and the answer is kept in _thishost.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000741
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported lazily on the first call; the result is cached in
    _ftperrors.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000750
_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object.

    The object is built once from an empty StringIO and cached in
    _noheaders.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    import mimetools
    from io import StringIO
    _noheaders = mimetools.Message(StringIO(), 0)
    _noheaders.fp.close()    # Recycle file descriptor
    return _noheaders
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000761
762
763# Utility classes
764
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        """Remember the connection parameters and connect immediately."""
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        """(Re)connect, log in, and cwd into each entry of self.dirs."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dirname in self.dirs:
            self.ftp.cwd(dirname)

    def retrfile(self, file, type):
        """Start retrieving `file` (or a directory listing).

        `type` is the FTP transfer type letter; 'd' or 'D' requests a
        directory listing.  Returns a pair (fileobj, length): fileobj is an
        addclosehook wrapper whose close() ends the transfer, and length is
        the second item returned by ftplib's ntransfercmd().

        Raises IOError('ftp error', reason) when RETR fails with a
        permanent error whose text does not start with '550'.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'):
            cmd = 'TYPE A'
            isdir = 1
        else:
            cmd = 'TYPE ' + type
            isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection may have gone stale: reconnect once and
            # let a second failure propagate.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # A '550' reply falls through to the LIST attempt below;
                # anything else is reported with the original traceback.
                if str(reason)[:3] != '550':
                    raise IOError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing
            if file:
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])

    def endtransfer(self):
        """Finish a pending transfer, if any, ignoring FTP errors."""
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            # Best effort: the transfer is over either way.
            pass

    def close(self):
        """End any pending transfer and close the FTP connection."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            # Best effort: nothing useful to do if closing fails.
            pass
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000831
class addbase:
    """Base class for addinfo and addclosehook.

    Wraps a file-like object `fp` and re-exports its read(), readline(),
    readlines() (when present) and fileno() as instance attributes.
    """

    # XXX Add a method to expose the timeout on the underlying socket?

    def __init__(self, fp):
        self.fp = fp
        self.read = self.fp.read
        self.readline = self.fp.readline
        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
        if hasattr(self.fp, "fileno"):
            self.fileno = self.fp.fileno
        else:
            # No descriptor available: fileno() answers None.
            self.fileno = lambda: None
        if hasattr(self.fp, "__next__"):
            self.__next__ = self.fp.__next__

    def __iter__(self):
        """Iterate over the wrapped file object.

        This has to be a real method: iter() and `for` look __iter__ up on
        the type, so an attribute assigned on the instance is ignored.
        """
        return iter(self.fp)

    def __repr__(self):
        return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
                                             id(self), self.fp)

    def close(self):
        """Close the wrapped file and disable the forwarded methods."""
        self.read = None
        self.readline = None
        self.readlines = None
        self.fileno = None
        if self.fp: self.fp.close()
        self.fp = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000862
class addclosehook(addbase):
    """Class to add a close hook to an open file.

    closehook(*hookargs) is called once, after the wrapped file has been
    closed.
    """

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.hookargs = hookargs
        self.closehook = closehook

    def close(self):
        addbase.close(self)
        if not self.closehook:
            return
        self.closehook(*self.hookargs)
        # Forget the hook so a second close() does not run it again.
        self.closehook = None
        self.hookargs = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000877
class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        """Wrap fp and remember the headers object for info()."""
        self.headers = headers
        addbase.__init__(self, fp)

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000887
class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        """Wrap fp and remember the headers object and URL."""
        self.url = url
        self.headers = headers
        addbase.__init__(self, fp)

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers

    def geturl(self):
        """Return the URL given to the constructor."""
        return self.url
Guido van Rossume6ad8911996-09-10 17:02:56 +0000901
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000902
Guido van Rossum7c395db1994-07-04 22:14:49 +0000903# Utilities to parse URLs (most of these return None for missing parts):
Sjoerd Mullendere0371b81995-11-10 10:36:07 +0000904# unwrap('<URL:type://host/path>') --> 'type://host/path'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000905# splittype('type:opaquestring') --> 'type', 'opaquestring'
906# splithost('//host[:port]/path') --> 'host[:port]', '/path'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000907# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
908# splitpasswd('user:passwd') -> 'user', 'passwd'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000909# splitport('host:port') --> 'host', 'port'
910# splitquery('/path?query') --> '/path', 'query'
911# splittag('/path#tag') --> '/path', 'tag'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000912# splitattr('/path;attr1=value1;attr2=value2;...') ->
913# '/path', ['attr1=value1', 'attr2=value2', ...]
914# splitvalue('attr=value') --> 'attr', 'value'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000915# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
917
# Pick the _is_unicode() implementation once, at import time, depending on
# whether the str type is available.
try:
    str
except NameError:
    def _is_unicode(x):
        """Without a str type nothing counts as a text string."""
        return 0
else:
    def _is_unicode(x):
        """Return true if x is a str instance."""
        return isinstance(x, str)
Guido van Rossum4b46c0a2002-05-24 17:58:05 +0000926
def toBytes(url):
    """toBytes(u"URL") --> 'URL'.

    Text strings are encoded as ASCII; anything else is returned as is.
    Raises UnicodeError if the URL contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if not _is_unicode(url):
        return url
    try:
        return url.encode("ASCII")
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
938
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Strips surrounding whitespace, an enclosing <...> pair and a leading
    'URL:' marker, in that order.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000946
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is lower-cased; it is None when url has no 'type:' prefix.
    """
    global _typeprog
    if _typeprog is None:
        # Compile lazily, on first use.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if not found:
        return None, url
    scheme = found.group(1)
    # The match covers the scheme plus the colon.
    return scheme.lower(), url[found.end():]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000960
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # Compile lazily, on first use.
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000972
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    Both parts are passed through unquote().  Returns (None, host) when
    there is no '@'.  The result is always a tuple.
    """
    global _userprog
    if _userprog is None:
        # Compile lazily, on first use.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match:
        # Build a real tuple: a bare map() is a one-shot iterator under
        # Python 3, unlike the tuple returned on the no-match path.
        return tuple(unquote(part) for part in match.group(1, 2))
    return None, host
Guido van Rossum7c395db1994-07-04 22:14:49 +0000984
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split is at the first ':'; passwd is None when there is none.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compile lazily, on first use.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.groups()
Guido van Rossum7c395db1994-07-04 22:14:49 +0000996
Guido van Rossume7b146f2000-02-04 15:28:42 +0000997# splittag('/path#tag') --> '/path', 'tag'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits, or None when host does not
    end in ':<digits>'.
    """
    global _portprog
    if _portprog is None:
        # Compile lazily, on first use.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001009
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        # Compile lazily, on first use.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            # int('') raises ValueError too, so an empty port ("host:")
            # yields None like any other non-numeric port.
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport
Guido van Rossum53725a21996-06-13 19:12:35 +00001031
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split is at the last '?'; query is None when there is none.
    """
    global _queryprog
    if _queryprog is None:
        # Compile lazily, on first use.  Raw string: '\?' is not a valid
        # escape sequence in an ordinary string literal.
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001043
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split is at the last '#'; tag is None when there is none.
    """
    global _tagprog
    if _tagprog is None:
        # Compile lazily, on first use.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.groups()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001055
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    path, semicolon, rest = url.partition(';')
    # Without a ';' there are no attributes at all.
    attrs = rest.split(';') if semicolon else []
    return path, attrs
Guido van Rossum7c395db1994-07-04 22:14:49 +00001061
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split is at the first '='; value is None when there is none.
    """
    global _valueprog
    if _valueprog is None:
        # Compile lazily, on first use.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.groups()
Guido van Rossum7c395db1994-07-04 22:14:49 +00001073
# Two-hex-digit string -> character, in both lower- and upper-case spelling.
_hextochr = dict((fmt % i, chr(i))
                 for fmt in ('%02x', '%02X')
                 for i in range(256))

def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    A '%' that is not followed by two hex digits is left untouched.
    """
    chunks = s.split('%')
    # Everything before the first '%' is literal text.
    decoded = [chunks[0]]
    for item in chunks[1:]:
        try:
            decoded.append(_hextochr[item[:2]] + item[2:])
        except KeyError:
            # Not a valid escape: put the '%' back.
            decoded.append('%' + item)
        except UnicodeDecodeError:
            decoded.append(chr(int(item[:2], 16)) + item[2:])
    return "".join(decoded)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001089
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'"""
    # '+' stands for a space; translate those before undoing %XX escapes.
    return unquote(s.replace('+', ' '))
Guido van Rossum0564e121996-12-13 14:47:36 +00001094
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')
# Quoter instances created by quote(), keyed by (safe, always_safe).
_safe_quoters= {}

class Quoter:
    """Callable mapping one character to its quoted form.

    Characters in `safe` or always_safe are returned unchanged.  Other
    characters below 256 become a single %XX escape; characters from 256 up
    become the %XX escapes of their UTF-8 bytes.
    """

    def __init__(self, safe):
        self.cache = {}
        self.safe = safe + always_safe

    def __call__(self, c):
        cached = self.cache.get(c)
        if cached is not None:
            return cached
        code = ord(c)
        if code >= 256:
            # Not cached: only results for characters below 256 are kept.
            return "".join(['%%%02X' % i for i in c.encode("utf-8")])
        if c in self.safe:
            quoted = c
        else:
            quoted = '%%%02X' % code
        self.cache[c] = quoted
        return quoted
Jeremy Hylton7ae51bf2000-09-14 16:59:07 +00001115
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    # One Quoter per distinct safe set, shared by all calls.
    cachekey = (safe, always_safe)
    quoter = _safe_quoters.get(cachekey)
    if quoter is None:
        quoter = Quoter(safe)
        _safe_quoters[cachekey] = quoter
    return ''.join(map(quoter, s))
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001145
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' not in s:
        return quote(s, safe)
    # Keep spaces unquoted while quoting, then turn each one into '+'.
    return quote(s, safe + ' ').replace(' ', '+')
Guido van Rossum0564e121996-12-13 14:47:36 +00001152
def urlencode(query,doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Raises TypeError if query is neither a mapping nor a sequence whose
    first element is a tuple.
    """

    if hasattr(query,"items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            # Re-raise with a clearer message, keeping the traceback of the
            # original failure.
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence or mapping object").with_traceback(tb)

    l = []
    if not doseq:
        # preserve old behavior
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if isinstance(v, str):
                # A string is one value, not a sequence of characters.
                v = quote_plus(v)
                l.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)
Guido van Rossum810a3391998-07-22 21:33:23 +00001215
Guido van Rossum442e7201996-03-20 15:33:11 +00001216# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    The environment is scanned for variables named <scheme>_proxy
    (compared case-insensitively); each one with a non-empty value
    yields an entry keyed by the lower-cased <scheme> part.  If you
    need a different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    suffix = '_proxy'
    lowered = ((name.lower(), value) for name, value in os.environ.items())
    return {name[:-len(suffix)]: value
            for name, value in lowered
            if value and name.endswith(suffix)}
1232
if sys.platform == 'darwin':
    def getproxies_internetconfig():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store proxies.
        The HTTP proxy is used when the UseHTTPProxy setting is true and
        is read from the HTTPProxyHost setting.

        An empty dictionary is returned when the ic module cannot be
        imported or ic.IC() raises ic.error.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

    def proxy_bypass(x):
        """Return 0: no proxy bypass list is consulted on this platform."""
        return 0

    def getproxies():
        """Return the environment proxies, or the Internet Config ones
        when the environment defines none."""
        return getproxies_environment() or getproxies_internetconfig()

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.  An empty (or partially
        filled) dictionary is returned when the registry cannot be read
        or a value has an unexpected format.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            try:
                proxyEnable = _winreg.QueryValueEx(internetSettings,
                                                   'ProxyEnable')[0]
                if proxyEnable:
                    # Returned as Unicode but problems if not converted to ASCII
                    proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                           'ProxyServer')[0])
                    if '=' in proxyServer:
                        # Per-protocol settings
                        import re
                        for p in proxyServer.split(';'):
                            protocol, address = p.split('=', 1)
                            # See if address has a type:// prefix
                            if not re.match('^([^/:]+)://', address):
                                address = '%s://%s' % (protocol, address)
                            proxies[protocol] = address
                    else:
                        # Use one setting for all protocols
                        if proxyServer[:5] == 'http:':
                            proxies['http'] = proxyServer
                        else:
                            proxies['http'] = 'http://%s' % proxyServer
                            proxies['ftp'] = 'ftp://%s' % proxyServer
            finally:
                # Release the key handle even when a query above fails.
                internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass(host):
        """Return 1 if host matches the registry's ProxyOverride list, else 0.

        host may carry a port (it is split off with splitport()).  The
        bare name, its resolved address and its fully qualified name are
        each compared against every ';'-separated override entry, where
        '*' and '?' act as glob wildcards and '<local>' stands for the
        local machine's names and address.  0 is also returned when the
        registry cannot be read, proxying is disabled, or the override
        list is empty.
        """
        try:
            import _winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            try:
                proxyEnable = _winreg.QueryValueEx(internetSettings,
                                                   'ProxyEnable')[0]
                # Returned as Unicode but problems if not converted to ASCII
                proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                         'ProxyOverride')[0])
            finally:
                # Release the key handle whether or not the queries worked.
                internetSettings.Close()
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        overrides = []
        for entry in proxyOverride.split(';'):
            if entry != '<local>':
                overrides.append(entry)
                continue
            localName = socket.gethostname()
            overrides.extend(['localhost', '127.0.0.1', localName])
            try:
                overrides.append(socket.gethostbyname(localName))
            except socket.error:
                # An unresolvable local name must not abort the check;
                # the name-based entries above still apply.
                pass
        # now check if we match one of the registry values.
        for test in overrides:
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # NOTE: re.match only anchors at the start of val, so an
                # entry also matches any longer name it is a prefix of.
                if re.match(test, val, re.I):
                    return 1
        return 0

else:
    # By default use environment variables
    getproxies = getproxies_environment

    def proxy_bypass(host):
        """Return 0: no proxy bypass list is consulted on this platform."""
        return 0
Guido van Rossum442e7201996-03-20 15:33:11 +00001390
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001391# Test and time quote() and unquote()
def test1():
    """Round-trip four copies of chr(0)..chr(255) through quote() and
    unquote(), print the original, quoted and unquoted strings, and
    report the elapsed time ('Wrong!' is printed first on a mismatch)."""
    sample = ''.join(chr(code) for code in range(256)) * 4
    started = time.time()
    quoted = quote(sample)
    restored = unquote(quoted)
    finished = time.time()
    if restored != sample:
        print('Wrong!')
    for text in (sample, quoted, restored):
        print(repr(text))
    print(round(finished - started, 3), 'sec')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001406
1407
def reporthook(blocknum, blocksize, totalsize):
    """Print one progress line for a remote transfer.

    blocknum, blocksize and totalsize are formatted as integers into a
    single line written to standard output.
    """
    progress = (blocknum, blocksize, totalsize)
    line = "Block number: %d, Block size: %d, Total size: %d" % progress
    print(line)
Guido van Rossum9ab96d41998-09-28 14:07:00 +00001412
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001413# Test program
def test(args=[]):
    """Retrieve each URL in args and print its headers and contents.

    When args is empty a built-in list of sample URLs is used instead
    (plus an https URL if URLopener has an open_https method).  For every
    URL the local file name returned by urlretrieve() is printed, then
    the headers (if any), then the file's contents with carriage returns
    removed.  urlcleanup() is always called at the end, even on error.
    """
    # The mutable default is never modified: an empty args is rebound to
    # a fresh list below before anything is appended.
    if not args:
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.gnu.org/pub/README',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print('-'*10, url, '-'*10)
            fn, h = urlretrieve(url, None, reporthook)
            print(fn)
            if h:
                print('======')
                for k in h.keys(): print(k + ':', h[k])
                print('======')
            # Close the file promptly instead of leaving it to the
            # garbage collector.
            with open(fn, 'rb') as fp:
                data = fp.read()
            # The file is read in binary mode, so strip carriage returns
            # with a bytes pattern.
            data = data.replace(b'\r', b'')
            print(data)
            fn, h = None, None
        print('-'*40)
    finally:
        urlcleanup()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001446
def main():
    """Command-line driver.

    Options are read from sys.argv: -h prints usage and returns; -t runs
    test() on the remaining arguments (given twice, test1() runs first).
    Without -t, the contents of each URL argument are printed, or a hint
    is printed when there are no arguments.  An unrecognised option
    prints the getopt error and a hint, then returns.
    """
    import getopt
    import sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error as msg:
        print(msg)
        print("Use -h for help")
        return

    selftest_level = 0
    for flag, _value in opts:
        if flag == '-h':
            print("Usage: python urllib.py [-t] [url ...]")
            print("-t runs self-test;", end=' ')
            print("otherwise, contents of urls are printed")
            return
        if flag == '-t':
            selftest_level += 1

    if selftest_level:
        if selftest_level > 1:
            test1()
        test(args)
        return

    if not args:
        print("Use -h for help")
    for url in args:
        print(urlopen(url).read(), end=' ')
Guido van Rossum23490151998-06-25 02:39:00 +00001473
# Run the command-line driver, main(), when this file is executed as a
# script; nothing is run when the module is merely imported.
if __name__ == '__main__':
    main()