blob: 7168a5121c0966f6dd3c7c8fd1a4305fb39aacf0 [file] [log] [blame]
# Open an arbitrary URL
#
# See the following document for a tentative description of URLs:
#     Uniform Resource Locators              Tim Berners-Lee
#     INTERNET DRAFT                                    CERN
#     IETF URL Working Group                    14 July 1993
#     draft-ietf-uri-url-01.txt
#
# The object returned by URLopener().open(file) will differ per
# protocol.  All you know is that it has methods read(), readline(),
# readlines(), fileno(), close() and info().  The read*(), fileno()
# and close() methods work like those of open files.
# The info() method returns a mimetools.Message object which can be
# used to query various info about the object, if available.
# (mimetools.Message objects are queried with the getheader() method.)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000016
Guido van Rossum7c395db1994-07-04 22:14:49 +000017import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000018import socket
19import regex
20
21
Guido van Rossumca445401995-08-29 19:19:12 +000022__version__ = '1.2'
Guido van Rossum6cb15a01995-06-22 19:00:13 +000023
24
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000025# This really consists of two pieces:
26# (1) a class which handles opening of all sorts of URLs
27# (plus assorted utilities etc.)
28# (2) a set of functions for parsing URLs
29# XXX Should these be separated out into different modules?
30
31
32# Shortcut for basic usage
_urlopener = None
def urlopen(url):
    """Open url with the shared module-level opener.

    The opener is a FancyURLopener that is created the first time it
    is needed and then reused by urlopen(), urlretrieve() and
    urlcleanup().  Returns whatever the opener's open() returns.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = FancyURLopener()
        _urlopener = opener
    return opener.open(url)
def urlretrieve(url):
    """Retrieve url with the shared module-level opener.

    The opener is a FancyURLopener created on first use.  Returns
    whatever the opener's retrieve() returns.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = FancyURLopener()
        _urlopener = opener
    return opener.retrieve(url)
def urlcleanup():
    """Run cleanup() on the shared opener, if one has been created."""
    opener = _urlopener
    if not opener:
        # Nothing was ever opened through the shortcuts
        return
    opener.cleanup()
47
48
49# Class to open URLs.
50# This is a class rather than just a subroutine because we may need
51# more than one set of global protocol-specific options.
Guido van Rossumbbb0a051995-08-04 04:29:05 +000052# Note -- this is a base class for those who don't want the
53# automatic handling of errors type 302 (relocated) and 401
54# (authorization needed).
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000055ftpcache = {}
56class URLopener:
57
58 # Constructor
59 def __init__(self):
Guido van Rossum6cb15a01995-06-22 19:00:13 +000060 server_version = "Python-urllib/%s" % __version__
61 self.addheaders = [('User-agent', server_version)]
Guido van Rossum7aeb4b91994-08-23 13:32:20 +000062 self.tempcache = None
63 # Undocumented feature: if you assign {} to tempcache,
64 # it is used to cache files retrieved with
65 # self.retrieve(). This is not enabled by default
66 # since it does not work for changing documents (and I
67 # haven't got the logic to check expiration headers
68 # yet).
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000069 self.ftpcache = ftpcache
70 # Undocumented feature: you can use a different
71 # ftp cache by assigning to the .ftpcache member;
72 # in case you want logically independent URL openers
73
74 def __del__(self):
75 self.close()
76
77 def close(self):
78 self.cleanup()
79
80 def cleanup(self):
81 import os
Guido van Rossum7aeb4b91994-08-23 13:32:20 +000082 if self.tempcache:
83 for url in self.tempcache.keys():
84 try:
85 os.unlink(self.tempcache[url][0])
86 except os.error:
87 pass
88 del self.tempcache[url]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000089
90 # Add a header to be used by the HTTP interface only
91 # e.g. u.addheader('Accept', 'sound/basic')
92 def addheader(self, *args):
93 self.addheaders.append(args)
94
95 # External interface
96 # Use URLopener().open(file) instead of open(file, 'r')
Guido van Rossumca445401995-08-29 19:19:12 +000097 def open(self, fullurl):
98 fullurl = unwrap(fullurl)
99 type, url = splittype(fullurl)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000100 if not type: type = 'file'
101 name = 'open_' + type
102 if '-' in name:
103 import regsub
104 name = regsub.gsub('-', '_', name)
105 if not hasattr(self, name):
Guido van Rossumca445401995-08-29 19:19:12 +0000106 return self.open_unknown(fullurl)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000107 try:
108 return getattr(self, name)(url)
109 except socket.error, msg:
110 raise IOError, ('socket error', msg)
111
Guido van Rossumca445401995-08-29 19:19:12 +0000112 # Overridable interface to open unknown URL type
113 def open_unknown(self, fullurl):
114 type, url = splittype(fullurl)
115 raise IOError, ('url error', 'unknown url type', type)
116
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000117 # External interface
118 # retrieve(url) returns (filename, None) for a local object
119 # or (tempfilename, headers) for a remote object
120 def retrieve(self, url):
Guido van Rossum7aeb4b91994-08-23 13:32:20 +0000121 if self.tempcache and self.tempcache.has_key(url):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000122 return self.tempcache[url]
123 url1 = unwrap(url)
Guido van Rossum7aeb4b91994-08-23 13:32:20 +0000124 if self.tempcache and self.tempcache.has_key(url1):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000125 self.tempcache[url] = self.tempcache[url1]
126 return self.tempcache[url1]
127 type, url1 = splittype(url1)
128 if not type or type == 'file':
129 try:
130 fp = self.open_local_file(url1)
131 del fp
132 return splithost(url1)[1], None
133 except IOError, msg:
134 pass
135 fp = self.open(url)
136 headers = fp.info()
137 import tempfile
138 tfn = tempfile.mktemp()
Guido van Rossumfa59e831994-09-21 11:36:19 +0000139 result = tfn, headers
Guido van Rossum7aeb4b91994-08-23 13:32:20 +0000140 if self.tempcache is not None:
Guido van Rossumfa59e831994-09-21 11:36:19 +0000141 self.tempcache[url] = result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000142 tfp = open(tfn, 'w')
143 bs = 1024*8
144 block = fp.read(bs)
145 while block:
146 tfp.write(block)
147 block = fp.read(bs)
148 del fp
149 del tfp
150 return result
151
152 # Each method named open_<type> knows how to open that type of URL
153
154 # Use HTTP protocol
155 def open_http(self, url):
156 import httplib
157 host, selector = splithost(url)
Guido van Rossum590b2891994-04-18 09:39:56 +0000158 if not host: raise IOError, ('http error', 'no host given')
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000159 i = string.find(host, '@')
160 if i >= 0:
161 user_passwd, host = host[:i], host[i+1:]
162 else:
163 user_passwd = None
164 if user_passwd:
165 import base64
166 auth = string.strip(base64.encodestring(user_passwd))
167 else:
168 auth = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000169 h = httplib.HTTP(host)
170 h.putrequest('GET', selector)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000171 if auth: h.putheader('Authorization: Basic %s' % auth)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000172 for args in self.addheaders: apply(h.putheader, args)
Guido van Rossum6cb15a01995-06-22 19:00:13 +0000173 h.endheaders()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000174 errcode, errmsg, headers = h.getreply()
Guido van Rossum6cb15a01995-06-22 19:00:13 +0000175 fp = h.getfile()
176 if errcode == 200:
177 return addinfo(fp, headers)
178 else:
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000179 return self.http_error(url,
180 fp, errcode, errmsg, headers)
181
182 # Handle http errors.
183 # Derived class can override this, or provide specific handlers
184 # named http_error_DDD where DDD is the 3-digit error code
185 def http_error(self, url, fp, errcode, errmsg, headers):
186 # First check if there's a specific handler for this error
187 name = 'http_error_%d' % errcode
188 if hasattr(self, name):
189 method = getattr(self, name)
190 result = method(url, fp, errcode, errmsg, headers)
191 if result: return result
192 return self.http_error_default(
193 url, fp, errcode, errmsg, headers)
194
195 # Default http error handler: close the connection and raises IOError
196 def http_error_default(self, url, fp, errcode, errmsg, headers):
197 void = fp.read()
198 fp.close()
199 raise IOError, ('http error', errcode, errmsg, headers)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000200
201 # Use Gopher protocol
202 def open_gopher(self, url):
203 import gopherlib
204 host, selector = splithost(url)
Guido van Rossum590b2891994-04-18 09:39:56 +0000205 if not host: raise IOError, ('gopher error', 'no host given')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000206 type, selector = splitgophertype(selector)
207 selector, query = splitquery(selector)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000208 selector = unquote(selector)
209 if query:
210 query = unquote(query)
211 fp = gopherlib.send_query(selector, query, host)
212 else:
213 fp = gopherlib.send_selector(selector, host)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000214 return addinfo(fp, noheaders())
215
216 # Use local file or FTP depending on form of URL
217 def open_file(self, url):
Guido van Rossumca445401995-08-29 19:19:12 +0000218 if url[:2] == '//':
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000219 return self.open_ftp(url)
Guido van Rossumca445401995-08-29 19:19:12 +0000220 else:
221 return self.open_local_file(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000222
223 # Use local file
224 def open_local_file(self, url):
225 host, file = splithost(url)
226 if not host: return addinfo(open(file, 'r'), noheaders())
227 host, port = splitport(host)
228 if not port and socket.gethostbyname(host) in (
229 localhost(), thishost()):
Guido van Rossum7c395db1994-07-04 22:14:49 +0000230 file = unquote(file)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000231 return addinfo(open(file, 'r'), noheaders())
232 raise IOError, ('local file error', 'not on local host')
233
234 # Use FTP protocol
235 def open_ftp(self, url):
Guido van Rossum7c395db1994-07-04 22:14:49 +0000236 host, path = splithost(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000237 if not host: raise IOError, ('ftp error', 'no host given')
238 host, port = splitport(host)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000239 user, host = splituser(host)
240 if user: user, passwd = splitpasswd(user)
241 else: passwd = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000242 host = socket.gethostbyname(host)
243 if not port:
244 import ftplib
245 port = ftplib.FTP_PORT
Guido van Rossum7c395db1994-07-04 22:14:49 +0000246 path, attrs = splitattr(path)
247 dirs = string.splitfields(path, '/')
248 dirs, file = dirs[:-1], dirs[-1]
249 if dirs and not dirs[0]: dirs = dirs[1:]
250 key = (user, host, port, string.joinfields(dirs, '/'))
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000251 try:
252 if not self.ftpcache.has_key(key):
Guido van Rossum7c395db1994-07-04 22:14:49 +0000253 self.ftpcache[key] = \
254 ftpwrapper(user, passwd,
255 host, port, dirs)
256 if not file: type = 'D'
257 else: type = 'I'
258 for attr in attrs:
259 attr, value = splitvalue(attr)
260 if string.lower(attr) == 'type' and \
261 value in ('a', 'A', 'i', 'I', 'd', 'D'):
262 type = string.upper(value)
263 return addinfo(self.ftpcache[key].retrfile(file, type),
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000264 noheaders())
265 except ftperrors(), msg:
266 raise IOError, ('ftp error', msg)
267
268
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000269# Derived class with handlers for errors we can handle (perhaps)
class FancyURLopener(URLopener):
    """URLopener that tries to recover from HTTP errors 302 and 401.

    A 302 reply is followed to its new location and a 401 reply that
    asks for basic authentication is retried with a user name and
    password.  Any other reply that is not 200 is returned to the
    caller as an ordinary file-like object instead of raising IOError.
    """

    # Largest number of redirects followed for one request; a false
    # value disables the limit
    maxtries = 10
    # Redirects followed so far for the request in progress.  The
    # class-level default covers instances set up without __init__
    tries = 0

    def __init__(self, *args):
        apply(URLopener.__init__, (self,) + args)
        # Maps 'realm@host' to a (user, passwd) pair
        self.auth_cache = {}
        self.tries = 0

    # Default error handling -- don't raise an exception
    def http_error_default(self, url, fp, errcode, errmsg, headers):
        return addinfo(fp, headers)

    # Error 302 -- relocated
    def http_error_302(self, url, fp, errcode, errmsg, headers):
        """Follow a redirect and return the object found at the new URL.

        Returns None (so that http_error() falls back to the default
        handler) when the reply names no new location.  Once more than
        maxtries redirects have been followed for one request, the
        reply is handed to http_error_default() instead of being
        followed, so a server cannot force unbounded recursion.
        """
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        self.tries = self.tries + 1
        if self.maxtries and self.tries > self.maxtries:
            self.tries = 0
            return self.http_error_default(
                url, fp, errcode, errmsg, headers)
        # Read the rest of the reply before closing
        fp.read()
        fp.close()
        # url is what open_http() received, i.e. the part after
        # 'http:'.  Resolve the new location against it: open() would
        # treat a location without a type as a local file name
        newurl = basejoin('http:' + url, unwrap(newurl))
        try:
            return self.open(newurl)
        finally:
            # The request is over, whatever its outcome
            self.tries = 0

    # Error 401 -- authentication required
    # See this URL for a description of the basic authentication scheme:
    # http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt
    def http_error_401(self, url, fp, errcode, errmsg, headers):
        """Retry with basic authentication if the server asks for it.

        Returns None when the reply carries no usable challenge, which
        makes http_error() fall back to the default handler.
        """
        if headers.has_key('www-authenticate'):
            stuff = headers['www-authenticate']
            p = regex.compile(
                '[ \t]*\([^ \t]+\)[ \t]+realm="\([^"]*\)"')
            if p.match(stuff) >= 0:
                scheme, realm = p.group(1, 2)
                if string.lower(scheme) == 'basic':
                    return self.retry_http_basic_auth(
                        url, realm)

    def retry_http_basic_auth(self, url, realm):
        """Reopen url with a user name and password put into the host part.

        Returns None if no user name or password is available.
        """
        host, selector = splithost(url)
        # i is nonzero exactly when the failed request already carried
        # user:passwd@; passing it as clear_cache then drops the cached
        # pair for this realm so the user is asked again
        i = string.find(host, '@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = user + ':' + passwd + '@' + host
        newurl = '//' + host + selector
        return self.open_http(newurl)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return (user, passwd) for realm at host.

        A cached pair is used unless clear_cache is true; otherwise
        prompt_user_passwd() is asked and a non-empty answer is cached.
        """
        key = realm + '@' + string.lower(host)
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Ask for a user name and password on the terminal.

        Returns (None, None) if the user interrupts the dialogue.
        """
        # Override this in a GUI environment!
        try:
            user = raw_input("Enter username for %s at %s: " %
                             (realm, host))
            self.echo_off()
            try:
                passwd = raw_input(
                    "Enter password for %s in %s at %s: " %
                    (user, realm, host))
            finally:
                # Restore echo even if the password prompt is interrupted
                self.echo_on()
            return user, passwd
        except KeyboardInterrupt:
            return None, None

    def echo_off(self):
        import os
        os.system("stty -echo")

    def echo_on(self):
        import os
        # The newline typed after the password was not echoed
        print
        os.system("stty echo")
352
353
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000354# Utility functions
355
356# Return the IP address of the magic hostname 'localhost'
_localhost = None
def localhost():
    """Return the IP address of the host name 'localhost'.

    The address is looked up on the first call and remembered in
    _localhost for later calls.
    """
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
363
364# Return the IP address of the current host
_thishost = None
def thishost():
    """Return the IP address of the host this program runs on.

    The address of socket.gethostname() is looked up on the first call
    and remembered in _thishost for later calls.
    """
    global _thishost
    if _thishost:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
371
372# Return the set of errors raised by the FTP class
_ftperrors = None
def ftperrors():
    """Return the tuple of ftplib errors that open_ftp() turns into IOError.

    ftplib is imported, and the tuple built, only on the first call.
    """
    global _ftperrors
    if _ftperrors:
        return _ftperrors
    import ftplib
    _ftperrors = (ftplib.error_reply,
                  ftplib.error_temp,
                  ftplib.error_perm,
                  ftplib.error_proto)
    return _ftperrors
383
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000384# Return an empty mimetools.Message object
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000385_noheaders = None
386def noheaders():
387 global _noheaders
388 if not _noheaders:
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000389 import mimetools
390 import StringIO
391 _noheaders = mimetools.Message(StringIO.StringIO(), 0)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000392 _noheaders.fp.close() # Recycle file descriptor
393 return _noheaders
394
395
396# Utility classes
397
398# Class used by open_ftp() for cache of open FTP connections
399class ftpwrapper:
Guido van Rossum7c395db1994-07-04 22:14:49 +0000400 def __init__(self, user, passwd, host, port, dirs):
401 self.user = unquote(user or '')
402 self.passwd = unquote(passwd or '')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000403 self.host = host
404 self.port = port
Guido van Rossum7c395db1994-07-04 22:14:49 +0000405 self.dirs = []
406 for dir in dirs:
407 self.dirs.append(unquote(dir))
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000408 self.init()
409 def init(self):
410 import ftplib
411 self.ftp = ftplib.FTP()
412 self.ftp.connect(self.host, self.port)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000413 self.ftp.login(self.user, self.passwd)
414 for dir in self.dirs:
415 self.ftp.cwd(dir)
416 def retrfile(self, file, type):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000417 import ftplib
Guido van Rossum7c395db1994-07-04 22:14:49 +0000418 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
419 else: cmd = 'TYPE ' + type; isdir = 0
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000420 try:
Guido van Rossum7c395db1994-07-04 22:14:49 +0000421 self.ftp.voidcmd(cmd)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000422 except ftplib.all_errors:
423 self.init()
Guido van Rossum7c395db1994-07-04 22:14:49 +0000424 self.ftp.voidcmd(cmd)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000425 conn = None
Guido van Rossum7c395db1994-07-04 22:14:49 +0000426 if file and not isdir:
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000427 try:
428 cmd = 'RETR ' + file
429 conn = self.ftp.transfercmd(cmd)
430 except ftplib.error_perm, reason:
431 if reason[:3] != '550':
432 raise IOError, ('ftp error', reason)
433 if not conn:
434 # Try a directory listing
435 if file: cmd = 'LIST ' + file
436 else: cmd = 'LIST'
437 conn = self.ftp.transfercmd(cmd)
438 return addclosehook(conn.makefile('r'), self.ftp.voidresp)
439
440# Base class for addinfo and addclosehook
class addbase:
    """Base class for addinfo and addclosehook.

    Wraps an open file object and exposes its read(), readline(),
    readlines() and fileno() methods as attributes of the wrapper.
    """

    # Names of the methods borrowed from the wrapped file object
    _borrowed = ('read', 'readline', 'readlines', 'fileno')

    def __init__(self, fp):
        self.fp = fp
        for name in self._borrowed:
            setattr(self, name, getattr(fp, name))

    def __repr__(self):
        return '<%s at %s whose fp = %s>' % (
            self.__class__.__name__, repr(id(self)), repr(self.fp))

    def close(self):
        """Drop the borrowed methods and close the wrapped file object."""
        for name in self._borrowed:
            setattr(self, name, None)
        if self.fp: self.fp.close()
        self.fp = None
458
459# Class to add a close hook to an open file
class addclosehook(addbase):
    """Open file wrapper that calls a hook function when it is closed."""

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        """Call the hook (first close only), then close the file."""
        hook = self.closehook
        if hook:
            apply(hook, self.hookargs)
            # Forget the hook so that it cannot run a second time
            self.closehook = None
            self.hookargs = None
        addbase.close(self)
471
472# class to add an info() method to an open file
class addinfo(addbase):
    """Open file wrapper that adds an info() method."""

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        # Returned unchanged by info()
        self.headers = headers

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers
479
480
481# Utility to combine a URL with a base URL to form a new URL
482
def basejoin(base, url):
    """Combine url with the base URL base and return the resulting URL.

    A url that has a type is returned unchanged.  Otherwise the type,
    and if url has no host also the host and directory, are taken from
    base.
    """
    urltype, path = splittype(url)
    if urltype:
        # url is complete (i.e., it contains a type)
        return url
    host, path = splithost(path)
    scheme, basepath = splittype(base)      # inherit type from base
    if host:
        # url contains a host, so only the type can be inherited
        if not scheme:
            # nothing to inherit: url must have started with //
            return url
        return scheme + '://' + host + path
    host, basepath = splithost(basepath)    # inherit host
    # remove extraneous cruft: the tag and the query of the base
    basepath = splittag(basepath)[0]
    basepath = splitquery(basepath)[0]
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to basepath
            cut = len(basepath)
        else:
            # else replace last component
            cut = string.rfind(basepath, '/')
        if cut >= 0:
            # remove last file component
            basepath = basepath[:cut+1]
        elif host:
            # basepath not absolute but host present, make absolute
            basepath = '/'
        else:
            # else keep non-absolute
            basepath = ''
        path = basepath + path
    if scheme:
        if host:
            return scheme + '://' + host + path
        return scheme + ':' + path
    if host:
        return '//' + host + path   # don't know what this means
    return path
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000524
525
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') --> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') -->
#     '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# splitgophertype('/Xselector') --> 'X', 'selector'
# unquote('abc%20def') --> 'abc def'
# quote('abc def') --> 'abc%20def'
541
542def unwrap(url):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000543 url = string.strip(url)
544 if url[:1] == '<' and url[-1:] == '>':
545 url = string.strip(url[1:-1])
546 if url[:4] == 'URL:': url = string.strip(url[4:])
547 return url
548
549_typeprog = regex.compile('^\([^/:]+\):\(.*\)$')
550def splittype(url):
551 if _typeprog.match(url) >= 0: return _typeprog.group(1, 2)
552 return None, url
553
554_hostprog = regex.compile('^//\([^/]+\)\(.*\)$')
555def splithost(url):
556 if _hostprog.match(url) >= 0: return _hostprog.group(1, 2)
557 return None, url
558
Guido van Rossum7c395db1994-07-04 22:14:49 +0000559_userprog = regex.compile('^\([^@]*\)@\(.*\)$')
560def splituser(host):
561 if _userprog.match(host) >= 0: return _userprog.group(1, 2)
562 return None, host
563
564_passwdprog = regex.compile('^\([^:]*\):\(.*\)$')
565def splitpasswd(user):
566 if _passwdprog.match(user) >= 0: return _passwdprog.group(1, 2)
567 return user, None
568
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000569_portprog = regex.compile('^\(.*\):\([0-9]+\)$')
570def splitport(host):
571 if _portprog.match(host) >= 0: return _portprog.group(1, 2)
572 return host, None
573
574_queryprog = regex.compile('^\(.*\)\?\([^?]*\)$')
575def splitquery(url):
576 if _queryprog.match(url) >= 0: return _queryprog.group(1, 2)
577 return url, None
578
579_tagprog = regex.compile('^\(.*\)#\([^#]*\)$')
580def splittag(url):
581 if _tagprog.match(url) >= 0: return _tagprog.group(1, 2)
582 return url, None
583
Guido van Rossum7c395db1994-07-04 22:14:49 +0000584def splitattr(url):
585 words = string.splitfields(url, ';')
586 return words[0], words[1:]
587
588_valueprog = regex.compile('^\([^=]*\)=\(.*\)$')
589def splitvalue(attr):
590 if _valueprog.match(attr) >= 0: return _valueprog.group(1, 2)
591 return attr, None
592
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'

    Returns (None, selector) unless selector is a '/' followed by at
    least one more character.
    """
    if selector[:1] != '/' or not selector[1:2]:
        return None, selector
    return selector[1], selector[2:]
597
598_quoteprog = regex.compile('%[0-9a-fA-F][0-9a-fA-F]')
599def unquote(s):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000600 i = 0
601 n = len(s)
602 res = ''
603 while 0 <= i < n:
604 j = _quoteprog.search(s, i)
605 if j < 0:
606 res = res + s[i:]
607 break
608 res = res + (s[i:j] + chr(eval('0x' + s[j+1:j+3])))
609 i = j+3
610 return res
611
Guido van Rossum3bb54481994-08-29 10:52:58 +0000612always_safe = string.letters + string.digits + '_,.-'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000613def quote(s, safe = '/'):
614 safe = always_safe + safe
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000615 res = ''
616 for c in s:
Guido van Rossum7c395db1994-07-04 22:14:49 +0000617 if c in safe:
618 res = res + c
619 else:
620 res = res + '%%%02x' % ord(c)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000621 return res
622
623# Test and time quote() and unquote()
624def test1():
625 import time
626 s = ''
627 for i in range(256): s = s + chr(i)
628 s = s*4
629 t0 = time.time()
630 qs = quote(s)
631 uqs = unquote(qs)
632 t1 = time.time()
633 if uqs != s:
634 print 'Wrong!'
635 print `s`
636 print `qs`
637 print `uqs`
638 print round(t1 - t0, 3), 'sec'
639
640
641# Test program
642def test():
643 import sys
644 import regsub
645 args = sys.argv[1:]
646 if not args:
647 args = [
648 '/etc/passwd',
649 'file:/etc/passwd',
650 'file://localhost/etc/passwd',
651 'ftp://ftp.cwi.nl/etc/passwd',
652 'gopher://gopher.cwi.nl/11/',
653 'http://www.cwi.nl/index.html',
654 ]
655 try:
656 for url in args:
657 print '-'*10, url, '-'*10
658 fn, h = urlretrieve(url)
659 print fn, h
660 if h:
661 print '======'
662 for k in h.keys(): print k + ':', h[k]
663 print '======'
664 fp = open(fn, 'r')
665 data = fp.read()
666 del fp
667 print regsub.gsub('\r', '', data)
668 fn, h = None, None
669 print '-'*40
670 finally:
671 urlcleanup()
672
673# Run test program when run as a script
if __name__ == '__main__':
    # test1(), the timing test for quote() and unquote(), is not run
    # by default
    test()