blob: 31636966d77558e1828d860a0556f1aad27b4189 [file] [log] [blame]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001# Open an arbitrary URL
2#
3# See the following document for a tentative description of URLs:
4# Uniform Resource Locators Tim Berners-Lee
5# INTERNET DRAFT CERN
6# IETF URL Working Group 14 July 1993
7# draft-ietf-uri-url-01.txt
8#
# The object returned by URLopener().open(file) will differ per
# protocol.  All you know is that it has methods read(), readline(),
# readlines(), fileno(), close() and info().  The read*(), fileno()
# and close() methods work like those of open files.
# The info() method returns a mimetools.Message object which can be
# used to query various info about the object, if available.
# (mimetools.Message objects are queried with the getheader() method.)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000016
Guido van Rossum7c395db1994-07-04 22:14:49 +000017import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000018import socket
19import regex
Jack Jansendc3e3f61995-12-15 13:22:13 +000020import os
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000021
22
Jack Jansene8ea21b1995-12-21 15:43:53 +000023__version__ = '1.2' # XXXX Should I update this number? -- jack
Guido van Rossum6cb15a01995-06-22 19:00:13 +000024
Jack Jansendc3e3f61995-12-15 13:22:13 +000025# Helper for non-unix systems
26if os.name == 'mac':
Jack Jansene8ea21b1995-12-21 15:43:53 +000027 def url2pathname(pathname):
28 "Convert /-delimited pathname to mac pathname"
29 #
30 # XXXX The .. handling should be fixed...
31 #
32 tp = splittype(pathname)[0]
33 if tp and tp <> 'file':
34 raise RuntimeError, 'Cannot convert non-local URL to pathname'
Jack Jansendc3e3f61995-12-15 13:22:13 +000035 components = string.split(pathname, '/')
Jack Jansen0d12ead1996-02-14 16:05:20 +000036 i = 0
37 while i < len(components):
38 if components[i] == '.':
39 del components[i]
40 elif components[i] == '..' and i > 0 and \
41 components[i-1] not in ('', '..'):
42 del components[i-1:i+1]
43 i = i-1
44 elif components[i] == '' and i > 0 and components[i-1] <> '':
45 del components[i]
46 else:
47 i = i+1
48 if not components or '..' in components or '.' in components or '' in components[1:-1]:
49 raise RuntimeError, 'Cannot normalize URL containing ., .. or // to pathname'
Jack Jansendc3e3f61995-12-15 13:22:13 +000050 if not components[0]:
51 # Absolute unix path, don't start with colon
52 return string.join(components[1:], ':')
53 else:
54 # relative unix path, start with colon
55 return ':' + string.join(components, ':')
Jack Jansene8ea21b1995-12-21 15:43:53 +000056
57 def pathname2url(pathname):
58 "convert mac pathname to /-delimited pathname"
59 if '/' in pathname:
60 raise RuntimeError, "Cannot convert pathname containing slashes"
61 components = string.split(pathname, ':')
62 if '' in components[1:-1]:
63 raise RuntimeError, "Cannot convert pathname containing ::"
64 # Truncate names longer than 31 bytes
65 components = map(lambda x: x[:31], components)
66
67 if os.path.isabs(pathname):
68 return '/' + string.join(components, '/')
69 else:
70 return string.join(components, '/')
Jack Jansendc3e3f61995-12-15 13:22:13 +000071else:
Jack Jansene8ea21b1995-12-21 15:43:53 +000072 def url2pathname(pathname):
Jack Jansendc3e3f61995-12-15 13:22:13 +000073 return pathname
Jack Jansene8ea21b1995-12-21 15:43:53 +000074 def pathname2url(pathname):
75 return pathname
Guido van Rossum6cb15a01995-06-22 19:00:13 +000076
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000077# This really consists of two pieces:
78# (1) a class which handles opening of all sorts of URLs
79# (plus assorted utilities etc.)
80# (2) a set of functions for parsing URLs
81# XXX Should these be separated out into different modules?
82
83
84# Shortcut for basic usage
_urlopener = None
def urlopen(url):
    # Open url through the shared module-level opener, creating a
    # FancyURLopener the first time one is needed.
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = FancyURLopener()
        _urlopener = opener
    return opener.open(url)
def urlretrieve(url):
    # Retrieve url through the shared module-level opener, creating a
    # FancyURLopener the first time one is needed.
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = FancyURLopener()
        _urlopener = opener
    return opener.retrieve(url)
def urlcleanup():
    # Ask the shared opener (if one was ever created) to clean up.
    opener = _urlopener
    if not opener:
        return
    opener.cleanup()
99
100
101# Class to open URLs.
102# This is a class rather than just a subroutine because we may need
103# more than one set of global protocol-specific options.
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000104# Note -- this is a base class for those who don't want the
105# automatic handling of errors type 302 (relocated) and 401
106# (authorization needed).
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000107ftpcache = {}
108class URLopener:
109
110 # Constructor
111 def __init__(self):
Guido van Rossum6cb15a01995-06-22 19:00:13 +0000112 server_version = "Python-urllib/%s" % __version__
113 self.addheaders = [('User-agent', server_version)]
Guido van Rossum7aeb4b91994-08-23 13:32:20 +0000114 self.tempcache = None
115 # Undocumented feature: if you assign {} to tempcache,
116 # it is used to cache files retrieved with
117 # self.retrieve(). This is not enabled by default
118 # since it does not work for changing documents (and I
119 # haven't got the logic to check expiration headers
120 # yet).
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000121 self.ftpcache = ftpcache
122 # Undocumented feature: you can use a different
123 # ftp cache by assigning to the .ftpcache member;
124 # in case you want logically independent URL openers
125
126 def __del__(self):
127 self.close()
128
129 def close(self):
130 self.cleanup()
131
132 def cleanup(self):
133 import os
Guido van Rossum7aeb4b91994-08-23 13:32:20 +0000134 if self.tempcache:
135 for url in self.tempcache.keys():
136 try:
137 os.unlink(self.tempcache[url][0])
138 except os.error:
139 pass
140 del self.tempcache[url]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000141
142 # Add a header to be used by the HTTP interface only
143 # e.g. u.addheader('Accept', 'sound/basic')
144 def addheader(self, *args):
145 self.addheaders.append(args)
146
147 # External interface
148 # Use URLopener().open(file) instead of open(file, 'r')
Guido van Rossumca445401995-08-29 19:19:12 +0000149 def open(self, fullurl):
150 fullurl = unwrap(fullurl)
151 type, url = splittype(fullurl)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000152 if not type: type = 'file'
153 name = 'open_' + type
154 if '-' in name:
155 import regsub
156 name = regsub.gsub('-', '_', name)
157 if not hasattr(self, name):
Guido van Rossumca445401995-08-29 19:19:12 +0000158 return self.open_unknown(fullurl)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000159 try:
160 return getattr(self, name)(url)
161 except socket.error, msg:
162 raise IOError, ('socket error', msg)
163
Guido van Rossumca445401995-08-29 19:19:12 +0000164 # Overridable interface to open unknown URL type
165 def open_unknown(self, fullurl):
166 type, url = splittype(fullurl)
167 raise IOError, ('url error', 'unknown url type', type)
168
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000169 # External interface
170 # retrieve(url) returns (filename, None) for a local object
171 # or (tempfilename, headers) for a remote object
172 def retrieve(self, url):
Guido van Rossum7aeb4b91994-08-23 13:32:20 +0000173 if self.tempcache and self.tempcache.has_key(url):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000174 return self.tempcache[url]
175 url1 = unwrap(url)
Guido van Rossum7aeb4b91994-08-23 13:32:20 +0000176 if self.tempcache and self.tempcache.has_key(url1):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000177 self.tempcache[url] = self.tempcache[url1]
178 return self.tempcache[url1]
179 type, url1 = splittype(url1)
180 if not type or type == 'file':
181 try:
182 fp = self.open_local_file(url1)
183 del fp
Jack Jansene8ea21b1995-12-21 15:43:53 +0000184 return url2pathname(splithost(url1)[1]), None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000185 except IOError, msg:
186 pass
187 fp = self.open(url)
188 headers = fp.info()
189 import tempfile
190 tfn = tempfile.mktemp()
Guido van Rossumfa59e831994-09-21 11:36:19 +0000191 result = tfn, headers
Guido van Rossum7aeb4b91994-08-23 13:32:20 +0000192 if self.tempcache is not None:
Guido van Rossumfa59e831994-09-21 11:36:19 +0000193 self.tempcache[url] = result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000194 tfp = open(tfn, 'w')
195 bs = 1024*8
196 block = fp.read(bs)
197 while block:
198 tfp.write(block)
199 block = fp.read(bs)
200 del fp
201 del tfp
202 return result
203
204 # Each method named open_<type> knows how to open that type of URL
205
206 # Use HTTP protocol
207 def open_http(self, url):
208 import httplib
209 host, selector = splithost(url)
Guido van Rossum590b2891994-04-18 09:39:56 +0000210 if not host: raise IOError, ('http error', 'no host given')
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000211 i = string.find(host, '@')
212 if i >= 0:
213 user_passwd, host = host[:i], host[i+1:]
214 else:
215 user_passwd = None
216 if user_passwd:
217 import base64
218 auth = string.strip(base64.encodestring(user_passwd))
219 else:
220 auth = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000221 h = httplib.HTTP(host)
222 h.putrequest('GET', selector)
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000223 if auth: h.putheader('Authorization: Basic %s' % auth)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000224 for args in self.addheaders: apply(h.putheader, args)
Guido van Rossum6cb15a01995-06-22 19:00:13 +0000225 h.endheaders()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000226 errcode, errmsg, headers = h.getreply()
Guido van Rossum6cb15a01995-06-22 19:00:13 +0000227 fp = h.getfile()
228 if errcode == 200:
229 return addinfo(fp, headers)
230 else:
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000231 return self.http_error(url,
232 fp, errcode, errmsg, headers)
233
234 # Handle http errors.
235 # Derived class can override this, or provide specific handlers
236 # named http_error_DDD where DDD is the 3-digit error code
237 def http_error(self, url, fp, errcode, errmsg, headers):
238 # First check if there's a specific handler for this error
239 name = 'http_error_%d' % errcode
240 if hasattr(self, name):
241 method = getattr(self, name)
242 result = method(url, fp, errcode, errmsg, headers)
243 if result: return result
244 return self.http_error_default(
245 url, fp, errcode, errmsg, headers)
246
247 # Default http error handler: close the connection and raises IOError
248 def http_error_default(self, url, fp, errcode, errmsg, headers):
249 void = fp.read()
250 fp.close()
251 raise IOError, ('http error', errcode, errmsg, headers)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000252
253 # Use Gopher protocol
254 def open_gopher(self, url):
255 import gopherlib
256 host, selector = splithost(url)
Guido van Rossum590b2891994-04-18 09:39:56 +0000257 if not host: raise IOError, ('gopher error', 'no host given')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000258 type, selector = splitgophertype(selector)
259 selector, query = splitquery(selector)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000260 selector = unquote(selector)
261 if query:
262 query = unquote(query)
263 fp = gopherlib.send_query(selector, query, host)
264 else:
265 fp = gopherlib.send_selector(selector, host)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000266 return addinfo(fp, noheaders())
267
268 # Use local file or FTP depending on form of URL
269 def open_file(self, url):
Guido van Rossumca445401995-08-29 19:19:12 +0000270 if url[:2] == '//':
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000271 return self.open_ftp(url)
Guido van Rossumca445401995-08-29 19:19:12 +0000272 else:
273 return self.open_local_file(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000274
275 # Use local file
276 def open_local_file(self, url):
277 host, file = splithost(url)
Jack Jansene8ea21b1995-12-21 15:43:53 +0000278 if not host: return addinfo(open(url2pathname(file), 'r'), noheaders())
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000279 host, port = splitport(host)
280 if not port and socket.gethostbyname(host) in (
281 localhost(), thishost()):
Guido van Rossum7c395db1994-07-04 22:14:49 +0000282 file = unquote(file)
Jack Jansene8ea21b1995-12-21 15:43:53 +0000283 return addinfo(open(url2pathname(file), 'r'), noheaders())
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000284 raise IOError, ('local file error', 'not on local host')
285
286 # Use FTP protocol
287 def open_ftp(self, url):
Guido van Rossum7c395db1994-07-04 22:14:49 +0000288 host, path = splithost(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000289 if not host: raise IOError, ('ftp error', 'no host given')
290 host, port = splitport(host)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000291 user, host = splituser(host)
292 if user: user, passwd = splitpasswd(user)
293 else: passwd = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000294 host = socket.gethostbyname(host)
295 if not port:
296 import ftplib
297 port = ftplib.FTP_PORT
Guido van Rossum7c395db1994-07-04 22:14:49 +0000298 path, attrs = splitattr(path)
299 dirs = string.splitfields(path, '/')
300 dirs, file = dirs[:-1], dirs[-1]
301 if dirs and not dirs[0]: dirs = dirs[1:]
302 key = (user, host, port, string.joinfields(dirs, '/'))
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000303 try:
304 if not self.ftpcache.has_key(key):
Guido van Rossum7c395db1994-07-04 22:14:49 +0000305 self.ftpcache[key] = \
306 ftpwrapper(user, passwd,
307 host, port, dirs)
308 if not file: type = 'D'
309 else: type = 'I'
310 for attr in attrs:
311 attr, value = splitvalue(attr)
312 if string.lower(attr) == 'type' and \
313 value in ('a', 'A', 'i', 'I', 'd', 'D'):
314 type = string.upper(value)
315 return addinfo(self.ftpcache[key].retrfile(file, type),
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000316 noheaders())
317 except ftperrors(), msg:
318 raise IOError, ('ftp error', msg)
319
320
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000321# Derived class with handlers for errors we can handle (perhaps)
class FancyURLopener(URLopener):

    def __init__(self, *args):
        apply(URLopener.__init__, (self,) + args)
        # maps 'realm@host' to (user, passwd); see get_user_passwd()
        self.auth_cache = {}

    # Default error handling -- don't raise an exception
    def http_error_default(self, url, fp, errcode, errmsg, headers):
        return addinfo(fp, headers)

    # Error 302 -- relocated
    # Returns the object opened at the new location, or None when the
    # reply names no new location.
    def http_error_302(self, url, fp, errcode, errmsg, headers):
        # XXX The server can force infinite recursion here!
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # The new location may be relative; resolve it against the URL
        # that was just requested (url arrives here without its
        # 'http:' prefix).  A complete URL passes through unchanged.
        newurl = basejoin('http:' + url, newurl)
        return self.open(newurl)

    # Error 401 -- authentication required
    # See this URL for a description of the basic authentication scheme:
    # http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt
    # Only the 'basic' scheme is retried; anything else returns None.
    def http_error_401(self, url, fp, errcode, errmsg, headers):
        if headers.has_key('www-authenticate'):
            stuff = headers['www-authenticate']
            p = regex.compile(
                '[ \t]*\([^ \t]+\)[ \t]+realm="\([^"]*\)"')
            if p.match(stuff) >= 0:
                scheme, realm = p.group(1, 2)
                if string.lower(scheme) == 'basic':
                    return self.retry_http_basic_auth(
                        url, realm)

    # Repeat the request with 'user:passwd@' placed before the host.
    # Returns None if no user name or password could be obtained.
    def retry_http_basic_auth(self, url, realm):
        host, selector = splithost(url)
        # i is nonzero when the URL already carried a 'user:passwd@'
        # part; it is passed on as the clear_cache flag
        i = string.find(host, '@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = user + ':' + passwd + '@' + host
        newurl = '//' + host + selector
        return self.open_http(newurl)

    # Return (user, passwd) for realm at host, from auth_cache or by
    # prompting.  A true clear_cache discards the cached entry first.
    def get_user_passwd(self, host, realm, clear_cache = 0):
        key = realm + '@' + string.lower(host)
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    # Ask for user name and password on the terminal; returns
    # (None, None) if interrupted from the keyboard.
    def prompt_user_passwd(self, host, realm):
        # Override this in a GUI environment!
        try:
            user = raw_input("Enter username for %s at %s: " %
                             (realm, host))
            self.echo_off()
            try:
                passwd = raw_input(
                    "Enter password for %s in %s at %s: " %
                    (user, realm, host))
            finally:
                self.echo_on()
            return user, passwd
        except KeyboardInterrupt:
            return None, None

    def echo_off(self):
        import os
        os.system("stty -echo")

    def echo_on(self):
        import os
        print
        os.system("stty echo")
404
405
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000406# Utility functions
407
408# Return the IP address of the magic hostname 'localhost'
_localhost = None
def localhost():
    # Look the address up once and remember it in _localhost.
    global _localhost
    addr = _localhost
    if not addr:
        addr = socket.gethostbyname('localhost')
        _localhost = addr
    return addr
415
416# Return the IP address of the current host
_thishost = None
def thishost():
    # Look the address up once and remember it in _thishost.
    global _thishost
    addr = _thishost
    if not addr:
        addr = socket.gethostbyname(socket.gethostname())
        _thishost = addr
    return addr
423
424# Return the set of errors raised by the FTP class
_ftperrors = None
def ftperrors():
    # The tuple is built on first use (so ftplib is only imported
    # when needed) and remembered in _ftperrors.
    global _ftperrors
    if _ftperrors:
        return _ftperrors
    import ftplib
    _ftperrors = (ftplib.error_reply, ftplib.error_temp,
                  ftplib.error_perm, ftplib.error_proto)
    return _ftperrors
435
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000436# Return an empty mimetools.Message object
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000437_noheaders = None
438def noheaders():
439 global _noheaders
440 if not _noheaders:
Guido van Rossumbbb0a051995-08-04 04:29:05 +0000441 import mimetools
442 import StringIO
443 _noheaders = mimetools.Message(StringIO.StringIO(), 0)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000444 _noheaders.fp.close() # Recycle file descriptor
445 return _noheaders
446
447
448# Utility classes
449
450# Class used by open_ftp() for cache of open FTP connections
451class ftpwrapper:
Guido van Rossum7c395db1994-07-04 22:14:49 +0000452 def __init__(self, user, passwd, host, port, dirs):
453 self.user = unquote(user or '')
454 self.passwd = unquote(passwd or '')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000455 self.host = host
456 self.port = port
Guido van Rossum7c395db1994-07-04 22:14:49 +0000457 self.dirs = []
458 for dir in dirs:
459 self.dirs.append(unquote(dir))
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000460 self.init()
461 def init(self):
462 import ftplib
463 self.ftp = ftplib.FTP()
464 self.ftp.connect(self.host, self.port)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000465 self.ftp.login(self.user, self.passwd)
466 for dir in self.dirs:
467 self.ftp.cwd(dir)
468 def retrfile(self, file, type):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000469 import ftplib
Guido van Rossum7c395db1994-07-04 22:14:49 +0000470 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
471 else: cmd = 'TYPE ' + type; isdir = 0
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000472 try:
Guido van Rossum7c395db1994-07-04 22:14:49 +0000473 self.ftp.voidcmd(cmd)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000474 except ftplib.all_errors:
475 self.init()
Guido van Rossum7c395db1994-07-04 22:14:49 +0000476 self.ftp.voidcmd(cmd)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000477 conn = None
Guido van Rossum7c395db1994-07-04 22:14:49 +0000478 if file and not isdir:
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000479 try:
480 cmd = 'RETR ' + file
481 conn = self.ftp.transfercmd(cmd)
482 except ftplib.error_perm, reason:
483 if reason[:3] != '550':
484 raise IOError, ('ftp error', reason)
485 if not conn:
486 # Try a directory listing
487 if file: cmd = 'LIST ' + file
488 else: cmd = 'LIST'
489 conn = self.ftp.transfercmd(cmd)
Jack Jansen0d12ead1996-02-14 16:05:20 +0000490 return addclosehook(conn.makefile('rb'), self.ftp.voidresp)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000491
492# Base class for addinfo and addclosehook
class addbase:
    # Wrap fp, re-exporting its read(), readline(), readlines() and
    # fileno() methods as attributes of the wrapper.
    def __init__(self, fp):
        self.fp = fp
        for name in ('read', 'readline', 'readlines', 'fileno'):
            setattr(self, name, getattr(fp, name))

    def __repr__(self):
        return '<%s at %s whose fp = %s>' % (
            self.__class__.__name__, repr(id(self)), repr(self.fp))

    # Drop the re-exported methods, then close and forget fp.
    def close(self):
        for name in ('read', 'readline', 'readlines', 'fileno'):
            setattr(self, name, None)
        fp = self.fp
        if fp: fp.close()
        self.fp = None
510
511# Class to add a close hook to an open file
class addclosehook(addbase):
    # closehook is called with hookargs when the object is closed.
    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    # Run the hook (once only), then close the file itself.
    def close(self):
        hook = self.closehook
        if hook:
            apply(hook, self.hookargs)
            self.closehook = None
            self.hookargs = None
        addbase.close(self)
523
524# class to add an info() method to an open file
class addinfo(addbase):
    # headers is whatever info() should hand back.
    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers

    # Return the headers object given to the constructor.
    def info(self):
        headers = self.headers
        return headers
531
532
533# Utility to combine a URL with a base URL to form a new URL
534
def basejoin(base, url):
    type, path = splittype(url)
    if type:
        # url is already complete (it names its own type)
        return url
    host, path = splithost(path)
    type, basepath = splittype(base) # the type comes from base
    if host:
        # url brings its own host, so only the type is inherited
        if not type:
            # nothing to inherit: url must have started with //
            return url
        return type + '://' + host + path
    host, basepath = splithost(basepath) # the host comes from base
    basepath = splittag(basepath)[0] # drop any '#tag'
    basepath = splitquery(basepath)[0] # drop any '?query'
    if path[:1] != '/':
        # relative path
        if path[:1] in ('#', '?'):
            # just a tag or query: attach it to the whole basepath
            cut = len(basepath)
        else:
            # otherwise replace the last component of basepath
            cut = string.rfind(basepath, '/')
        if cut >= 0:
            basepath = basepath[:cut+1]
        elif host:
            # no slash in basepath but a host is present: make absolute
            basepath = '/'
        else:
            # no slash and no host: keep it relative
            basepath = ''
        path = basepath + path
    if type:
        if host: return type + '://' + host + path
        return type + ':' + path
    if host: return '//' + host + path # don't know what this means
    return path
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000576
577
Guido van Rossum7c395db1994-07-04 22:14:49 +0000578# Utilities to parse URLs (most of these return None for missing parts):
Sjoerd Mullendere0371b81995-11-10 10:36:07 +0000579# unwrap('<URL:type://host/path>') --> 'type://host/path'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000580# splittype('type:opaquestring') --> 'type', 'opaquestring'
581# splithost('//host[:port]/path') --> 'host[:port]', '/path'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000582# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
583# splitpasswd('user:passwd') -> 'user', 'passwd'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000584# splitport('host:port') --> 'host', 'port'
585# splitquery('/path?query') --> '/path', 'query'
586# splittag('/path#tag') --> '/path', 'tag'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000587# splitattr('/path;attr1=value1;attr2=value2;...') ->
588# '/path', ['attr1=value1', 'attr2=value2', ...]
589# splitvalue('attr=value') --> 'attr', 'value'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000590# splitgophertype('/Xselector') --> 'X', 'selector'
591# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
593
594def unwrap(url):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000595 url = string.strip(url)
596 if url[:1] == '<' and url[-1:] == '>':
597 url = string.strip(url[1:-1])
598 if url[:4] == 'URL:': url = string.strip(url[4:])
599 return url
600
601_typeprog = regex.compile('^\([^/:]+\):\(.*\)$')
602def splittype(url):
603 if _typeprog.match(url) >= 0: return _typeprog.group(1, 2)
604 return None, url
605
606_hostprog = regex.compile('^//\([^/]+\)\(.*\)$')
607def splithost(url):
608 if _hostprog.match(url) >= 0: return _hostprog.group(1, 2)
609 return None, url
610
Guido van Rossum7c395db1994-07-04 22:14:49 +0000611_userprog = regex.compile('^\([^@]*\)@\(.*\)$')
612def splituser(host):
613 if _userprog.match(host) >= 0: return _userprog.group(1, 2)
614 return None, host
615
616_passwdprog = regex.compile('^\([^:]*\):\(.*\)$')
617def splitpasswd(user):
618 if _passwdprog.match(user) >= 0: return _passwdprog.group(1, 2)
619 return user, None
620
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000621_portprog = regex.compile('^\(.*\):\([0-9]+\)$')
622def splitport(host):
623 if _portprog.match(host) >= 0: return _portprog.group(1, 2)
624 return host, None
625
626_queryprog = regex.compile('^\(.*\)\?\([^?]*\)$')
627def splitquery(url):
628 if _queryprog.match(url) >= 0: return _queryprog.group(1, 2)
629 return url, None
630
631_tagprog = regex.compile('^\(.*\)#\([^#]*\)$')
632def splittag(url):
633 if _tagprog.match(url) >= 0: return _tagprog.group(1, 2)
634 return url, None
635
Guido van Rossum7c395db1994-07-04 22:14:49 +0000636def splitattr(url):
637 words = string.splitfields(url, ';')
638 return words[0], words[1:]
639
640_valueprog = regex.compile('^\([^=]*\)=\(.*\)$')
641def splitvalue(attr):
642 if _valueprog.match(attr) >= 0: return _valueprog.group(1, 2)
643 return attr, None
644
def splitgophertype(selector):
    # '/Xselector' --> ('X', 'selector'); anything that does not start
    # with '/' plus one more character gives (None, selector)
    if selector[:1] != '/' or not selector[1:2]:
        return None, selector
    return selector[1], selector[2:]
649
650_quoteprog = regex.compile('%[0-9a-fA-F][0-9a-fA-F]')
651def unquote(s):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000652 i = 0
653 n = len(s)
654 res = ''
655 while 0 <= i < n:
656 j = _quoteprog.search(s, i)
657 if j < 0:
658 res = res + s[i:]
659 break
Guido van Rossum8c8a02a1996-01-26 17:41:44 +0000660 res = res + (s[i:j] + chr(string.atoi(s[j+1:j+3], 16)))
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000661 i = j+3
662 return res
663
Guido van Rossum3bb54481994-08-29 10:52:58 +0000664always_safe = string.letters + string.digits + '_,.-'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000665def quote(s, safe = '/'):
666 safe = always_safe + safe
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000667 res = ''
668 for c in s:
Guido van Rossum7c395db1994-07-04 22:14:49 +0000669 if c in safe:
670 res = res + c
671 else:
672 res = res + '%%%02x' % ord(c)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000673 return res
674
675# Test and time quote() and unquote()
676def test1():
677 import time
678 s = ''
679 for i in range(256): s = s + chr(i)
680 s = s*4
681 t0 = time.time()
682 qs = quote(s)
683 uqs = unquote(qs)
684 t1 = time.time()
685 if uqs != s:
686 print 'Wrong!'
687 print `s`
688 print `qs`
689 print `uqs`
690 print round(t1 - t0, 3), 'sec'
691
692
693# Test program
694def test():
695 import sys
696 import regsub
697 args = sys.argv[1:]
698 if not args:
699 args = [
700 '/etc/passwd',
701 'file:/etc/passwd',
702 'file://localhost/etc/passwd',
703 'ftp://ftp.cwi.nl/etc/passwd',
704 'gopher://gopher.cwi.nl/11/',
705 'http://www.cwi.nl/index.html',
706 ]
707 try:
708 for url in args:
709 print '-'*10, url, '-'*10
710 fn, h = urlretrieve(url)
711 print fn, h
712 if h:
713 print '======'
714 for k in h.keys(): print k + ':', h[k]
715 print '======'
716 fp = open(fn, 'r')
717 data = fp.read()
718 del fp
719 print regsub.gsub('\r', '', data)
720 fn, h = None, None
721 print '-'*40
722 finally:
723 urlcleanup()
724
725# Run test program when run as a script
726if __name__ == '__main__':
Guido van Rossum7c395db1994-07-04 22:14:49 +0000727## test1()
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000728 test()