blob: 39780d983de30a993e9568e12044b2278301e5e1 [file] [log] [blame]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001# Open an arbitrary URL
2#
3# See the following document for a tentative description of URLs:
4# Uniform Resource Locators Tim Berners-Lee
5# INTERNET DRAFT CERN
6# IETF URL Working Group 14 July 1993
7# draft-ietf-uri-url-01.txt
8#
9# The object returned by URLopener().open(file) will differ per
# protocol. All you know is that it has methods read(), readline(),
11# readlines(), fileno(), close() and info(). The read*(), fileno()
12# and close() methods work like those of open files.
Guido van Rossumbbb0a051995-08-04 04:29:05 +000013# The info() method returns an mimetools.Message object which can be
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000014# used to query various info about the object, if available.
Guido van Rossumbbb0a051995-08-04 04:29:05 +000015# (mimetools.Message objects are queried with the getheader() method.)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000016
Guido van Rossum7c395db1994-07-04 22:14:49 +000017import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000018import socket
19import regex
20
21
Guido van Rossumca445401995-08-29 19:19:12 +000022__version__ = '1.2'
Guido van Rossum6cb15a01995-06-22 19:00:13 +000023
24
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000025# This really consists of two pieces:
26# (1) a class which handles opening of all sorts of URLs
27# (plus assorted utilities etc.)
28# (2) a set of functions for parsing URLs
29# XXX Should these be separated out into different modules?
30
31
32# Shortcut for basic usage
# Opener shared by urlopen(), urlretrieve() and urlcleanup();
# it is created the first time one of the first two is called
_urlopener = None

# Open url with the shared FancyURLopener and return whatever its
# open() method returns
def urlopen(url):
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    return opener.open(url)
# Retrieve url with the shared FancyURLopener (created on first use)
# and return the result of its retrieve() method
def urlretrieve(url):
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url)
# Ask the shared opener, if one was ever created, to clean up
def urlcleanup():
    opener = _urlopener
    if not opener:
        return
    opener.cleanup()
47
48
49# Class to open URLs.
50# This is a class rather than just a subroutine because we may need
51# more than one set of global protocol-specific options.
Guido van Rossumbbb0a051995-08-04 04:29:05 +000052# Note -- this is a base class for those who don't want the
53# automatic handling of errors type 302 (relocated) and 401
54# (authorization needed).
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000055ftpcache = {}
56class URLopener:
57
58 # Constructor
59 def __init__(self):
Guido van Rossum6cb15a01995-06-22 19:00:13 +000060 server_version = "Python-urllib/%s" % __version__
61 self.addheaders = [('User-agent', server_version)]
Guido van Rossum7aeb4b91994-08-23 13:32:20 +000062 self.tempcache = None
63 # Undocumented feature: if you assign {} to tempcache,
64 # it is used to cache files retrieved with
65 # self.retrieve(). This is not enabled by default
66 # since it does not work for changing documents (and I
67 # haven't got the logic to check expiration headers
68 # yet).
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000069 self.ftpcache = ftpcache
70 # Undocumented feature: you can use a different
71 # ftp cache by assigning to the .ftpcache member;
72 # in case you want logically independent URL openers
73
74 def __del__(self):
75 self.close()
76
77 def close(self):
78 self.cleanup()
79
80 def cleanup(self):
81 import os
Guido van Rossum7aeb4b91994-08-23 13:32:20 +000082 if self.tempcache:
83 for url in self.tempcache.keys():
84 try:
85 os.unlink(self.tempcache[url][0])
86 except os.error:
87 pass
88 del self.tempcache[url]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000089
90 # Add a header to be used by the HTTP interface only
91 # e.g. u.addheader('Accept', 'sound/basic')
92 def addheader(self, *args):
93 self.addheaders.append(args)
94
95 # External interface
96 # Use URLopener().open(file) instead of open(file, 'r')
Guido van Rossumca445401995-08-29 19:19:12 +000097 def open(self, fullurl):
98 fullurl = unwrap(fullurl)
99 type, url = splittype(fullurl)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000100 if not type: type = 'file'
101 name = 'open_' + type
102 if '-' in name:
103 import regsub
104 name = regsub.gsub('-', '_', name)
105 if not hasattr(self, name):
Guido van Rossumca445401995-08-29 19:19:12 +0000106 return self.open_unknown(fullurl)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000107 try:
108 return getattr(self, name)(url)
109 except socket.error, msg:
110 raise IOError, ('socket error', msg)
111
Guido van Rossumca445401995-08-29 19:19:12 +0000112 # Overridable interface to open unknown URL type
113 def open_unknown(self, fullurl):
114 type, url = splittype(fullurl)
115 raise IOError, ('url error', 'unknown url type', type)
116
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000117 # External interface
118 # retrieve(url) returns (filename, None) for a local object
119 # or (tempfilename, headers) for a remote object
120 def retrieve(self, url):
Guido van Rossum7aeb4b91994-08-23 13:32:20 +0000121 if self.tempcache and self.tempcache.has_key(url):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000122 return self.tempcache[url]
123 url1 = unwrap(url)
Guido van Rossum7aeb4b91994-08-23 13:32:20 +0000124 if self.tempcache and self.tempcache.has_key(url1):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000125 self.tempcache[url] = self.tempcache[url1]
126 return self.tempcache[url1]
127 type, url1 = splittype(url1)
128 if not type or type == 'file':
129 try:
130 fp = self.open_local_file(url1)
131 del fp
132 return splithost(url1)[1], None
133 except IOError, msg:
134 pass
135 fp = self.open(url)
136 headers = fp.info()
137 import tempfile
138 tfn = tempfile.mktemp()
Guido van Rossumfa59e831994-09-21 11:36:19 +0000139 result = tfn, headers
Guido van Rossum7aeb4b91994-08-23 13:32:20 +0000140 if self.tempcache is not None:
Guido van Rossumfa59e831994-09-21 11:36:19 +0000141 self.tempcache[url] = result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000142 tfp = open(tfn, 'w')
143 bs = 1024*8
144 block = fp.read(bs)
145 while block:
146 tfp.write(block)
147 block = fp.read(bs)
148 del fp
149 del tfp
150 return result
151
152 # Each method named open_<type> knows how to open that type of URL
153
154 # Use HTTP protocol
155 def open_http(self, url):
156 import httplib
157 host, selector = splithost(url)
Guido van Rossum590b2891994-04-18 09:39:56 +0000158 if not host: raise IOError, ('http error', 'no host given')
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000159 i = string.find(host, '@')
160 if i >= 0:
161 user_passwd, host = host[:i], host[i+1:]
162 else:
163 user_passwd = None
164 if user_passwd:
165 import base64
166 auth = string.strip(base64.encodestring(user_passwd))
167 else:
168 auth = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000169 h = httplib.HTTP(host)
170 h.putrequest('GET', selector)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000171 if auth: h.putheader('Authorization: Basic %s' % auth)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000172 for args in self.addheaders: apply(h.putheader, args)
Guido van Rossum6cb15a01995-06-22 19:00:13 +0000173 h.endheaders()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000174 errcode, errmsg, headers = h.getreply()
Guido van Rossum6cb15a01995-06-22 19:00:13 +0000175 fp = h.getfile()
176 if errcode == 200:
177 return addinfo(fp, headers)
178 else:
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000179 return self.http_error(url,
180 fp, errcode, errmsg, headers)
181
182 # Handle http errors.
183 # Derived class can override this, or provide specific handlers
184 # named http_error_DDD where DDD is the 3-digit error code
185 def http_error(self, url, fp, errcode, errmsg, headers):
186 # First check if there's a specific handler for this error
187 name = 'http_error_%d' % errcode
188 if hasattr(self, name):
189 method = getattr(self, name)
190 result = method(url, fp, errcode, errmsg, headers)
191 if result: return result
192 return self.http_error_default(
193 url, fp, errcode, errmsg, headers)
194
195 # Default http error handler: close the connection and raises IOError
196 def http_error_default(self, url, fp, errcode, errmsg, headers):
197 void = fp.read()
198 fp.close()
199 raise IOError, ('http error', errcode, errmsg, headers)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000200
201 # Use Gopher protocol
202 def open_gopher(self, url):
203 import gopherlib
204 host, selector = splithost(url)
Guido van Rossum590b2891994-04-18 09:39:56 +0000205 if not host: raise IOError, ('gopher error', 'no host given')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000206 type, selector = splitgophertype(selector)
207 selector, query = splitquery(selector)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000208 selector = unquote(selector)
209 if query:
210 query = unquote(query)
211 fp = gopherlib.send_query(selector, query, host)
212 else:
213 fp = gopherlib.send_selector(selector, host)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000214 return addinfo(fp, noheaders())
215
216 # Use local file or FTP depending on form of URL
217 def open_file(self, url):
Guido van Rossumca445401995-08-29 19:19:12 +0000218 if url[:2] == '//':
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000219 return self.open_ftp(url)
Guido van Rossumca445401995-08-29 19:19:12 +0000220 else:
221 return self.open_local_file(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000222
223 # Use local file
224 def open_local_file(self, url):
225 host, file = splithost(url)
226 if not host: return addinfo(open(file, 'r'), noheaders())
227 host, port = splitport(host)
228 if not port and socket.gethostbyname(host) in (
229 localhost(), thishost()):
Guido van Rossum7c395db1994-07-04 22:14:49 +0000230 file = unquote(file)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000231 return addinfo(open(file, 'r'), noheaders())
232 raise IOError, ('local file error', 'not on local host')
233
234 # Use FTP protocol
235 def open_ftp(self, url):
Guido van Rossum7c395db1994-07-04 22:14:49 +0000236 host, path = splithost(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000237 if not host: raise IOError, ('ftp error', 'no host given')
238 host, port = splitport(host)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000239 user, host = splituser(host)
240 if user: user, passwd = splitpasswd(user)
241 else: passwd = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000242 host = socket.gethostbyname(host)
243 if not port:
244 import ftplib
245 port = ftplib.FTP_PORT
Guido van Rossum7c395db1994-07-04 22:14:49 +0000246 path, attrs = splitattr(path)
247 dirs = string.splitfields(path, '/')
248 dirs, file = dirs[:-1], dirs[-1]
249 if dirs and not dirs[0]: dirs = dirs[1:]
250 key = (user, host, port, string.joinfields(dirs, '/'))
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000251 try:
252 if not self.ftpcache.has_key(key):
Guido van Rossum7c395db1994-07-04 22:14:49 +0000253 self.ftpcache[key] = \
254 ftpwrapper(user, passwd,
255 host, port, dirs)
256 if not file: type = 'D'
257 else: type = 'I'
258 for attr in attrs:
259 attr, value = splitvalue(attr)
260 if string.lower(attr) == 'type' and \
261 value in ('a', 'A', 'i', 'I', 'd', 'D'):
262 type = string.upper(value)
263 return addinfo(self.ftpcache[key].retrfile(file, type),
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000264 noheaders())
265 except ftperrors(), msg:
266 raise IOError, ('ftp error', msg)
267
268
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000269# Derived class with handlers for errors we can handle (perhaps)
class FancyURLopener(URLopener):

    # Largest number of redirects that may be nested for a single
    # request; assign 0 to follow redirects without limit
    maxtries = 10
    # Number of redirects currently being followed.  The class-level
    # default keeps subclasses working that do not call __init__.
    tries = 0

    def __init__(self, *args):
        apply(URLopener.__init__, (self,) + args)
        # Maps 'realm@host' to a (user, passwd) pair
        self.auth_cache = {}
        self.tries = 0

    # Default error handling -- don't raise an exception
    def http_error_default(self, url, fp, errcode, errmsg, headers):
        return addinfo(fp, headers)

    # Error 302 -- relocated
    # Returns the object opened at the new location, or None when
    # the reply names no location or the redirect limit has been
    # reached; http_error() then falls back to http_error_default().
    def http_error_302(self, url, fp, errcode, errmsg, headers):
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        if self.maxtries and self.tries >= self.maxtries:
            # The server keeps redirecting; stop here instead of
            # recursing forever
            return
        void = fp.read()
        fp.close()
        # url is the part after 'http:'.  Resolve the new location
        # against it so a relative location stays on this server;
        # open() would treat a URL without a type as a local file.
        newurl = basejoin('http:' + url, newurl)
        self.tries = self.tries + 1
        try:
            result = self.open(newurl)
        finally:
            self.tries = self.tries - 1
        return result

    # Error 401 -- authentication required
    # See this URL for a description of the basic authentication scheme:
    # http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt
    # Only the 'basic' scheme is retried; anything else returns None
    def http_error_401(self, url, fp, errcode, errmsg, headers):
        if headers.has_key('www-authenticate'):
            stuff = headers['www-authenticate']
            p = regex.compile(
                      '[ \t]*\([^ \t]+\)[ \t]+realm="\([^"]*\)"')
            if p.match(stuff) >= 0:
                scheme, realm = p.group(1, 2)
                if string.lower(scheme) == 'basic':
                    return self.retry_http_basic_auth(
                              url, realm)

    # Repeat the request with a user and password in the host part.
    # Returns None when no user or password was obtained.
    def retry_http_basic_auth(self, url, realm):
        host, selector = splithost(url)
        # i is nonzero when the failed URL already carried user
        # info; in that case any cached entry is discarded
        i = string.find(host, '@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = user + ':' + passwd + '@' + host
        newurl = '//' + host + selector
        return self.open_http(newurl)

    # Return (user, passwd) for realm at host, from the cache when
    # possible, else by prompting.  A true clear_cache discards the
    # cached entry first.
    def get_user_passwd(self, host, realm, clear_cache = 0):
        key = realm + '@' + string.lower(host)
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    # Ask for user and password on the terminal.
    # Returns (None, None) when interrupted from the keyboard.
    def prompt_user_passwd(self, host, realm):
        # Override this in a GUI environment!
        try:
            user = raw_input("Enter username for %s at %s: " %
                             (realm, host))
            self.echo_off()
            try:
                passwd = raw_input(
                  "Enter password for %s in %s at %s: " %
                  (user, realm, host))
            finally:
                self.echo_on()
            return user, passwd
        except KeyboardInterrupt:
            return None, None

    # Turn terminal echo off while the password is typed
    def echo_off(self):
        import os
        os.system("stty -echo")

    # Turn terminal echo back on; the blank line is printed because
    # the user's newline was not echoed
    def echo_on(self):
        import os
        print
        os.system("stty echo")
352
353
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000354# Utility functions
355
356# Return the IP address of the magic hostname 'localhost'
# Address of 'localhost', looked up on the first call only
_localhost = None
def localhost():
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
363
364# Return the IP address of the current host
# Address of the host we are running on, looked up on the first
# call only
_thishost = None
def thishost():
    global _thishost
    if _thishost:
        return _thishost
    hostname = socket.gethostname()
    _thishost = socket.gethostbyname(hostname)
    return _thishost
371
372# Return the set of errors raised by the FTP class
# Tuple of the ftplib errors that open_ftp() converts to IOError;
# ftplib is imported and the tuple built on the first call only
_ftperrors = None
def ftperrors():
    global _ftperrors
    if _ftperrors:
        return _ftperrors
    import ftplib
    _ftperrors = (ftplib.error_reply, ftplib.error_temp,
                  ftplib.error_perm, ftplib.error_proto)
    return _ftperrors
383
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000384# Return an empty mimetools.Message object
# Return an empty mimetools.Message object, built on the first call
# and shared by all later calls
_noheaders = None
def noheaders():
    global _noheaders
    # Compare with None instead of testing truth: a message without
    # any headers may itself count as false (NOTE: depends on whether
    # the message class defines a length -- confirm), which would
    # rebuild the object on every call
    if _noheaders is None:
        import mimetools
        import StringIO
        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
        _noheaders.fp.close()	# Recycle file descriptor
    return _noheaders
394
395
396# Utility classes
397
398# Class used by open_ftp() for cache of open FTP connections
399class ftpwrapper:
Guido van Rossum7c395db1994-07-04 22:14:49 +0000400 def __init__(self, user, passwd, host, port, dirs):
401 self.user = unquote(user or '')
402 self.passwd = unquote(passwd or '')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000403 self.host = host
404 self.port = port
Guido van Rossum7c395db1994-07-04 22:14:49 +0000405 self.dirs = []
406 for dir in dirs:
407 self.dirs.append(unquote(dir))
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000408 self.init()
409 def init(self):
410 import ftplib
411 self.ftp = ftplib.FTP()
412 self.ftp.connect(self.host, self.port)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000413 self.ftp.login(self.user, self.passwd)
414 for dir in self.dirs:
415 self.ftp.cwd(dir)
416 def retrfile(self, file, type):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000417 import ftplib
Guido van Rossum7c395db1994-07-04 22:14:49 +0000418 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
419 else: cmd = 'TYPE ' + type; isdir = 0
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000420 try:
Guido van Rossum7c395db1994-07-04 22:14:49 +0000421 self.ftp.voidcmd(cmd)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000422 except ftplib.all_errors:
423 self.init()
Guido van Rossum7c395db1994-07-04 22:14:49 +0000424 self.ftp.voidcmd(cmd)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000425 conn = None
Guido van Rossum7c395db1994-07-04 22:14:49 +0000426 if file and not isdir:
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000427 try:
428 cmd = 'RETR ' + file
429 conn = self.ftp.transfercmd(cmd)
430 except ftplib.error_perm, reason:
431 if reason[:3] != '550':
432 raise IOError, ('ftp error', reason)
433 if not conn:
434 # Try a directory listing
435 if file: cmd = 'LIST ' + file
436 else: cmd = 'LIST'
437 conn = self.ftp.transfercmd(cmd)
438 return addclosehook(conn.makefile('r'), self.ftp.voidresp)
439
440# Base class for addinfo and addclosehook
class addbase:
    # Wrap fp and expose its reading methods as attributes of the
    # wrapper itself
    def __init__(self, fp):
        self.fp = fp
        self.read = fp.read
        self.readline = fp.readline
        self.readlines = fp.readlines
        self.fileno = fp.fileno

    def __repr__(self):
        classname = self.__class__.__name__
        return '<%s at %s whose fp = %s>' % (
                  classname, repr(id(self)), repr(self.fp))

    # Drop the borrowed methods and close the wrapped file, if it
    # is still there
    def close(self):
        self.read = self.readline = None
        self.readlines = self.fileno = None
        fp = self.fp
        if fp:
            fp.close()
        self.fp = None
458
459# Class to add a close hook to an open file
class addclosehook(addbase):
    # closehook is called with hookargs the first time close()
    # is called
    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        hook = self.closehook
        if hook:
            apply(hook, self.hookargs)
            # Forget the hook so a second close() does not run it
            self.closehook = self.hookargs = None
        addbase.close(self)
471
472# class to add an info() method to an open file
class addinfo(addbase):
    # headers is the object that info() hands back
    def __init__(self, fp, headers):
        self.headers = headers
        addbase.__init__(self, fp)

    def info(self):
        return self.headers
479
480
481# Utility to combine a URL with a base URL to form a new URL
482
# Combine url with the base URL base and return the new URL.
# - A url with both type and host is returned unchanged.
# - So is a url whose type differs from the base's type (a base
#   without a type counts as 'file'): host and path of one scheme
#   mean nothing for another, e.g. for 'mailto:someone'.
# - Otherwise missing type and host are taken from base, and a
#   path that does not start with '/' is taken relative to the
#   base path (without the base's tag and query).
def basejoin(base, url):
    type, path = splittype(url)
    host, path = splithost(path)
    if type and host: return url
    basetype, basepath = splittype(base)
    if type and type != (basetype or 'file'): return url
    basehost, basepath = splithost(basepath)
    basepath, basetag = splittag(basepath)
    basepath, basequery = splitquery(basepath)
    if not type: type = basetype or 'file'
    if path[:1] != '/':
        if path[:1] in ('#', '?') and basepath:
            # Just a tag or query: it refers to the base
            # document itself, so keep the whole base path
            pass
        else:
            # Replace the last component of the base path
            i = string.rfind(basepath, '/')
            if i < 0: basepath = '/'
            else: basepath = basepath[:i+1]
        path = basepath + path
    if not host: host = basehost
    if host: return type + '://' + host + path
    else: return type + ':' + path
500
501
Guido van Rossum7c395db1994-07-04 22:14:49 +0000502# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
504# splittype('type:opaquestring') --> 'type', 'opaquestring'
505# splithost('//host[:port]/path') --> 'host[:port]', '/path'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000506# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
507# splitpasswd('user:passwd') -> 'user', 'passwd'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000508# splitport('host:port') --> 'host', 'port'
509# splitquery('/path?query') --> '/path', 'query'
510# splittag('/path#tag') --> '/path', 'tag'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000511# splitattr('/path;attr1=value1;attr2=value2;...') ->
512# '/path', ['attr1=value1', 'attr2=value2', ...]
513# splitvalue('attr=value') --> 'attr', 'value'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000514# splitgophertype('/Xselector') --> 'X', 'selector'
515# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
517
# Strip surrounding whitespace, an enclosing '<' ... '>' pair and
# a leading 'URL:' from url, in that order
def unwrap(url):
    url = string.strip(url)
    bracketed = url[:1] == '<' and url[-1:] == '>'
    if bracketed:
        url = string.strip(url[1:-1])
    if url[:4] != 'URL:':
        return url
    return string.strip(url[4:])
524
# Split 'type:rest' into ('type', 'rest'); the type may not contain
# '/' or ':'.  Returns (None, url) when url has no such prefix.
_typeprog = regex.compile('^\([^/:]+\):\(.*\)$')
def splittype(url):
    matched = _typeprog.match(url)
    if matched < 0:
        return None, url
    return _typeprog.group(1, 2)
529
# Split '//host/path' into ('host', '/path').
# Returns (None, url) when url does not start with '//host'.
_hostprog = regex.compile('^//\([^/]+\)\(.*\)$')
def splithost(url):
    matched = _hostprog.match(url)
    if matched < 0:
        return None, url
    return _hostprog.group(1, 2)
534
# Split 'user@host' at the first '@' into ('user', 'host').
# Returns (None, host) when there is no '@'.
_userprog = regex.compile('^\([^@]*\)@\(.*\)$')
def splituser(host):
    matched = _userprog.match(host)
    if matched < 0:
        return None, host
    return _userprog.group(1, 2)
539
# Split 'user:passwd' at the first ':' into ('user', 'passwd').
# Returns (user, None) when there is no ':'.
_passwdprog = regex.compile('^\([^:]*\):\(.*\)$')
def splitpasswd(user):
    matched = _passwdprog.match(user)
    if matched < 0:
        return user, None
    return _passwdprog.group(1, 2)
544
# Split 'host:port' into ('host', 'port'); the port must be all
# digits and is returned as a string.  Returns (host, None) when
# host does not end in ':digits'.
_portprog = regex.compile('^\(.*\):\([0-9]+\)$')
def splitport(host):
    matched = _portprog.match(host)
    if matched < 0:
        return host, None
    return _portprog.group(1, 2)
549
# Split '/path?query' at the last '?' into ('/path', 'query').
# Returns (url, None) when there is no '?'.
_queryprog = regex.compile('^\(.*\)\?\([^?]*\)$')
def splitquery(url):
    matched = _queryprog.match(url)
    if matched < 0:
        return url, None
    return _queryprog.group(1, 2)
554
# Split '/path#tag' at the last '#' into ('/path', 'tag').
# Returns (url, None) when there is no '#'.
_tagprog = regex.compile('^\(.*\)#\([^#]*\)$')
def splittag(url):
    matched = _tagprog.match(url)
    if matched < 0:
        return url, None
    return _tagprog.group(1, 2)
559
# Split '/path;attr1=value1;attr2=value2' into the path and the
# list of 'attr=value' strings (empty when there are none)
def splitattr(url):
    fields = string.splitfields(url, ';')
    path = fields[0]
    attrs = fields[1:]
    return path, attrs
563
# Split 'attr=value' at the first '=' into ('attr', 'value').
# Returns (attr, None) when there is no '='.
_valueprog = regex.compile('^\([^=]*\)=\(.*\)$')
def splitvalue(attr):
    matched = _valueprog.match(attr)
    if matched < 0:
        return attr, None
    return _valueprog.group(1, 2)
568
# Split '/Xselector' into ('X', 'selector'), where X is the single
# character after the leading '/'.  Returns (None, selector) when
# selector does not start with '/' or nothing follows the '/'.
def splitgophertype(selector):
    if selector[:1] != '/' or not selector[1:2]:
        return None, selector
    return selector[1], selector[2:]
573
# Replace every '%XX' escape in s (XX being two hex digits) by the
# character with that code and return the result; anything else is
# copied unchanged
_quoteprog = regex.compile('%[0-9a-fA-F][0-9a-fA-F]')
def unquote(s):
    i = 0
    n = len(s)
    # Collect the pieces and join them once at the end, rather than
    # copying the growing result for every escape
    pieces = []
    while 0 <= i < n:
        j = _quoteprog.search(s, i)
        if j < 0:
            pieces.append(s[i:])
            break
        pieces.append(s[i:j])
        # eval() only ever sees '0x' followed by the two hex digits
        # that _quoteprog matched, so it cannot run other code
        pieces.append(chr(eval('0x' + s[j+1:j+3])))
        i = j+3
    return string.joinfields(pieces, '')
587
# Characters that quote() never escapes
always_safe = string.letters + string.digits + '_,.-'

# Return s with every character that is neither in always_safe nor
# in safe replaced by '%' and its two-digit lower-case hex code
def quote(s, safe = '/'):
    safe = always_safe + safe
    # Collect the pieces and join them once at the end, rather than
    # copying the growing result for every character
    pieces = []
    for c in s:
        if c not in safe:
            c = '%%%02x' % ord(c)
        pieces.append(c)
    return string.joinfields(pieces, '')
598
599# Test and time quote() and unquote()
600def test1():
601 import time
602 s = ''
603 for i in range(256): s = s + chr(i)
604 s = s*4
605 t0 = time.time()
606 qs = quote(s)
607 uqs = unquote(qs)
608 t1 = time.time()
609 if uqs != s:
610 print 'Wrong!'
611 print `s`
612 print `qs`
613 print `uqs`
614 print round(t1 - t0, 3), 'sec'
615
616
617# Test program
618def test():
619 import sys
620 import regsub
621 args = sys.argv[1:]
622 if not args:
623 args = [
624 '/etc/passwd',
625 'file:/etc/passwd',
626 'file://localhost/etc/passwd',
627 'ftp://ftp.cwi.nl/etc/passwd',
628 'gopher://gopher.cwi.nl/11/',
629 'http://www.cwi.nl/index.html',
630 ]
631 try:
632 for url in args:
633 print '-'*10, url, '-'*10
634 fn, h = urlretrieve(url)
635 print fn, h
636 if h:
637 print '======'
638 for k in h.keys(): print k + ':', h[k]
639 print '======'
640 fp = open(fn, 'r')
641 data = fp.read()
642 del fp
643 print regsub.gsub('\r', '', data)
644 fn, h = None, None
645 print '-'*40
646 finally:
647 urlcleanup()
648
649# Run test program when run as a script
650if __name__ == '__main__':
Guido van Rossum7c395db1994-07-04 22:14:49 +0000651## test1()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000652 test()