blob: c49f032066bf1fc34db6e3261d41abf114b65378 [file] [log] [blame]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001# Open an arbitrary URL
2#
3# See the following document for a tentative description of URLs:
4# Uniform Resource Locators Tim Berners-Lee
5# INTERNET DRAFT CERN
6# IETF URL Working Group 14 July 1993
7# draft-ietf-uri-url-01.txt
8#
# The object returned by URLopener().open(file) will differ per
# protocol.  All you know is that it has methods read(), readline(),
# readlines(), fileno(), close() and info().  The read*(), fileno()
# and close() methods work like those of open files.
# The info() method returns an rfc822.Message object which can be
# used to query various info about the object, if available.
# (rfc822.Message objects are queried with the getheader() method.)
16
Guido van Rossum7c395db1994-07-04 22:14:49 +000017import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000018import socket
19import regex
20
21
22# This really consists of two pieces:
23# (1) a class which handles opening of all sorts of URLs
24# (plus assorted utilities etc.)
25# (2) a set of functions for parsing URLs
26# XXX Should these be separated out into different modules?
27
28
# Shortcuts for basic usage.  They share a single URLopener instance
# which is created the first time urlopen() or urlretrieve() needs it.
_urlopener = None

# Return the shared URLopener, creating it on first use
def _getopener():
    global _urlopener
    if not _urlopener:
        _urlopener = URLopener()
    return _urlopener

# Open url with the shared opener; returns whatever URLopener.open() returns
def urlopen(url):
    return _getopener().open(url)

# Retrieve url with the shared opener; returns what URLopener.retrieve() returns
def urlretrieve(url):
    return _getopener().retrieve(url)

# Ask the shared opener (if one was ever created) to clean up
def urlcleanup():
    opener = _urlopener
    if opener:
        opener.cleanup()
44
45
46# Class to open URLs.
47# This is a class rather than just a subroutine because we may need
48# more than one set of global protocol-specific options.
49ftpcache = {}
50class URLopener:
51
52 # Constructor
53 def __init__(self):
54 self.addheaders = []
Guido van Rossum7aeb4b91994-08-23 13:32:20 +000055 self.tempcache = None
56 # Undocumented feature: if you assign {} to tempcache,
57 # it is used to cache files retrieved with
58 # self.retrieve(). This is not enabled by default
59 # since it does not work for changing documents (and I
60 # haven't got the logic to check expiration headers
61 # yet).
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000062 self.ftpcache = ftpcache
63 # Undocumented feature: you can use a different
64 # ftp cache by assigning to the .ftpcache member;
65 # in case you want logically independent URL openers
66
67 def __del__(self):
68 self.close()
69
70 def close(self):
71 self.cleanup()
72
73 def cleanup(self):
74 import os
Guido van Rossum7aeb4b91994-08-23 13:32:20 +000075 if self.tempcache:
76 for url in self.tempcache.keys():
77 try:
78 os.unlink(self.tempcache[url][0])
79 except os.error:
80 pass
81 del self.tempcache[url]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000082
83 # Add a header to be used by the HTTP interface only
84 # e.g. u.addheader('Accept', 'sound/basic')
85 def addheader(self, *args):
86 self.addheaders.append(args)
87
88 # External interface
89 # Use URLopener().open(file) instead of open(file, 'r')
90 def open(self, url):
91 type, url = splittype(unwrap(url))
92 if not type: type = 'file'
93 name = 'open_' + type
94 if '-' in name:
95 import regsub
96 name = regsub.gsub('-', '_', name)
97 if not hasattr(self, name):
98 raise IOError, ('url error', 'unknown url type', type)
99 try:
100 return getattr(self, name)(url)
101 except socket.error, msg:
102 raise IOError, ('socket error', msg)
103
104 # External interface
105 # retrieve(url) returns (filename, None) for a local object
106 # or (tempfilename, headers) for a remote object
107 def retrieve(self, url):
Guido van Rossum7aeb4b91994-08-23 13:32:20 +0000108 if self.tempcache and self.tempcache.has_key(url):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000109 return self.tempcache[url]
110 url1 = unwrap(url)
Guido van Rossum7aeb4b91994-08-23 13:32:20 +0000111 if self.tempcache and self.tempcache.has_key(url1):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000112 self.tempcache[url] = self.tempcache[url1]
113 return self.tempcache[url1]
114 type, url1 = splittype(url1)
115 if not type or type == 'file':
116 try:
117 fp = self.open_local_file(url1)
118 del fp
119 return splithost(url1)[1], None
120 except IOError, msg:
121 pass
122 fp = self.open(url)
123 headers = fp.info()
124 import tempfile
125 tfn = tempfile.mktemp()
Guido van Rossumfa59e831994-09-21 11:36:19 +0000126 result = tfn, headers
Guido van Rossum7aeb4b91994-08-23 13:32:20 +0000127 if self.tempcache is not None:
Guido van Rossumfa59e831994-09-21 11:36:19 +0000128 self.tempcache[url] = result
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000129 tfp = open(tfn, 'w')
130 bs = 1024*8
131 block = fp.read(bs)
132 while block:
133 tfp.write(block)
134 block = fp.read(bs)
135 del fp
136 del tfp
137 return result
138
139 # Each method named open_<type> knows how to open that type of URL
140
141 # Use HTTP protocol
142 def open_http(self, url):
143 import httplib
144 host, selector = splithost(url)
Guido van Rossum590b2891994-04-18 09:39:56 +0000145 if not host: raise IOError, ('http error', 'no host given')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000146 h = httplib.HTTP(host)
147 h.putrequest('GET', selector)
148 for args in self.addheaders: apply(h.putheader, args)
149 errcode, errmsg, headers = h.getreply()
150 if errcode == 200: return addinfo(h.getfile(), headers)
151 else: raise IOError, ('http error', errcode, errmsg, headers)
152
153 # Use Gopher protocol
154 def open_gopher(self, url):
155 import gopherlib
156 host, selector = splithost(url)
Guido van Rossum590b2891994-04-18 09:39:56 +0000157 if not host: raise IOError, ('gopher error', 'no host given')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000158 type, selector = splitgophertype(selector)
159 selector, query = splitquery(selector)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000160 selector = unquote(selector)
161 if query:
162 query = unquote(query)
163 fp = gopherlib.send_query(selector, query, host)
164 else:
165 fp = gopherlib.send_selector(selector, host)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000166 return addinfo(fp, noheaders())
167
168 # Use local file or FTP depending on form of URL
169 def open_file(self, url):
170 try:
171 return self.open_local_file(url)
172 except IOError:
173 return self.open_ftp(url)
174
175 # Use local file
176 def open_local_file(self, url):
177 host, file = splithost(url)
178 if not host: return addinfo(open(file, 'r'), noheaders())
179 host, port = splitport(host)
180 if not port and socket.gethostbyname(host) in (
181 localhost(), thishost()):
Guido van Rossum7c395db1994-07-04 22:14:49 +0000182 file = unquote(file)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000183 return addinfo(open(file, 'r'), noheaders())
184 raise IOError, ('local file error', 'not on local host')
185
186 # Use FTP protocol
187 def open_ftp(self, url):
Guido van Rossum7c395db1994-07-04 22:14:49 +0000188 host, path = splithost(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000189 if not host: raise IOError, ('ftp error', 'no host given')
190 host, port = splitport(host)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000191 user, host = splituser(host)
192 if user: user, passwd = splitpasswd(user)
193 else: passwd = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000194 host = socket.gethostbyname(host)
195 if not port:
196 import ftplib
197 port = ftplib.FTP_PORT
Guido van Rossum7c395db1994-07-04 22:14:49 +0000198 path, attrs = splitattr(path)
199 dirs = string.splitfields(path, '/')
200 dirs, file = dirs[:-1], dirs[-1]
201 if dirs and not dirs[0]: dirs = dirs[1:]
202 key = (user, host, port, string.joinfields(dirs, '/'))
Guido van Rossum3f9a6ec1994-08-12 13:16:50 +0000203## print 'key =', key
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000204 try:
205 if not self.ftpcache.has_key(key):
Guido van Rossum7c395db1994-07-04 22:14:49 +0000206 self.ftpcache[key] = \
207 ftpwrapper(user, passwd,
208 host, port, dirs)
209 if not file: type = 'D'
210 else: type = 'I'
211 for attr in attrs:
212 attr, value = splitvalue(attr)
213 if string.lower(attr) == 'type' and \
214 value in ('a', 'A', 'i', 'I', 'd', 'D'):
215 type = string.upper(value)
216 return addinfo(self.ftpcache[key].retrfile(file, type),
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000217 noheaders())
218 except ftperrors(), msg:
219 raise IOError, ('ftp error', msg)
220
221
222# Utility functions
223
# Return the IP address of the magic hostname 'localhost'.
# The lookup is done once; later calls return the remembered address.
_localhost = None
def localhost():
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
231
# Return the IP address of the current host.
# The lookup is done once; later calls return the remembered address.
_thishost = None
def thishost():
    global _thishost
    if _thishost:
        return _thishost
    hostname = socket.gethostname()
    _thishost = socket.gethostbyname(hostname)
    return _thishost
239
# Return the set of errors raised by the FTP class, as a tuple usable
# in an except clause.  ftplib is imported only on the first call.
_ftperrors = None
def ftperrors():
    global _ftperrors
    if _ftperrors:
        return _ftperrors
    import ftplib
    _ftperrors = (
        ftplib.error_reply,
        ftplib.error_temp,
        ftplib.error_perm,
        ftplib.error_proto)
    return _ftperrors
251
# Return an empty rfc822.Message object.
# The object is built once, by parsing /dev/null, and then shared by
# all callers.
_noheaders = None
def noheaders():
    global _noheaders
    # Compare with None rather than testing truth: the cached message
    # holds no headers, so it may well count as false, and a truth
    # test would then rebuild it on every call.
    if _noheaders is None:
        import rfc822
        _noheaders = rfc822.Message(open('/dev/null', 'r'))
        _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders
261
262
263# Utility classes
264
265# Class used by open_ftp() for cache of open FTP connections
266class ftpwrapper:
Guido van Rossum7c395db1994-07-04 22:14:49 +0000267 def __init__(self, user, passwd, host, port, dirs):
268 self.user = unquote(user or '')
269 self.passwd = unquote(passwd or '')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000270 self.host = host
271 self.port = port
Guido van Rossum7c395db1994-07-04 22:14:49 +0000272 self.dirs = []
273 for dir in dirs:
274 self.dirs.append(unquote(dir))
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000275 self.init()
276 def init(self):
277 import ftplib
278 self.ftp = ftplib.FTP()
279 self.ftp.connect(self.host, self.port)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000280 self.ftp.login(self.user, self.passwd)
281 for dir in self.dirs:
282 self.ftp.cwd(dir)
283 def retrfile(self, file, type):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000284 import ftplib
Guido van Rossum7c395db1994-07-04 22:14:49 +0000285 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
286 else: cmd = 'TYPE ' + type; isdir = 0
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000287 try:
Guido van Rossum7c395db1994-07-04 22:14:49 +0000288 self.ftp.voidcmd(cmd)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000289 except ftplib.all_errors:
290 self.init()
Guido van Rossum7c395db1994-07-04 22:14:49 +0000291 self.ftp.voidcmd(cmd)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000292 conn = None
Guido van Rossum7c395db1994-07-04 22:14:49 +0000293 if file and not isdir:
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000294 try:
295 cmd = 'RETR ' + file
296 conn = self.ftp.transfercmd(cmd)
297 except ftplib.error_perm, reason:
298 if reason[:3] != '550':
299 raise IOError, ('ftp error', reason)
300 if not conn:
301 # Try a directory listing
302 if file: cmd = 'LIST ' + file
303 else: cmd = 'LIST'
304 conn = self.ftp.transfercmd(cmd)
305 return addclosehook(conn.makefile('r'), self.ftp.voidresp)
306
# Base class for addinfo and addclosehook.
# Wraps a file object: read(), readline(), readlines() and fileno()
# are the wrapped file's own bound methods, copied onto the instance.
class addbase:

    def __init__(self, fp):
        self.fp = fp
        for name in ('read', 'readline', 'readlines', 'fileno'):
            setattr(self, name, getattr(self.fp, name))

    def __repr__(self):
        pieces = (self.__class__.__name__, repr(id(self)), repr(self.fp))
        return '<%s at %s whose fp = %s>' % pieces

    def __del__(self):
        self.close()

    # Drop every reference to the wrapped file.  The file's own close()
    # is not called here.
    def close(self):
        for name in ('read', 'readline', 'readlines', 'fileno'):
            setattr(self, name, None)
        self.fp = None
326
# Class to add a close hook to an open file.
# close() calls closehook(*hookargs) at most once, before the
# reference to the wrapped file is dropped.
class addclosehook(addbase):

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        hook = self.closehook
        if hook:
            apply(hook, self.hookargs)
            # Forget the hook so that a second close() does not run it
            self.closehook = None
            self.hookargs = None
        addbase.close(self)
338 addbase.close(self)
339
# Class to add an info() method to an open file.
# info() returns the headers object handed to the constructor.
class addinfo(addbase):

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        headers = self.headers
        return headers
346 return self.headers
347
348
# Utility to combine a URL with a base URL to form a new URL
#
# url is returned unchanged when it is already absolute, i.e. when it
# names its own host, or when it uses another type than the base
# (e.g. a mailto: link relative to an http: base).  Otherwise missing
# parts are taken from base: the type (default 'file'), the host, and,
# for a path not starting with '/', the directory part of the base
# path (the base's #tag and ?query are ignored).

def basejoin(base, url):
    type, path = splittype(url)
    host, path = splithost(path)
    basetype, basepath = splittype(base)
    # A base without a type counts as 'file', as it does below
    if type and (host or type != (basetype or 'file')): return url
    basehost, basepath = splithost(basepath)
    basepath, basetag = splittag(basepath)
    basepath, basequery = splitquery(basepath)
    if not type: type = basetype or 'file'
    if path[:1] != '/':
        # Relative path: resolve against the directory of the base
        i = string.rfind(basepath, '/')
        if i < 0: basepath = '/'
        else: basepath = basepath[:i+1]
        path = basepath + path
    if not host: host = basehost
    if host: return type + '://' + host + path
    else: return type + ':' + path
367 else: return type + ':' + path
368
369
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') --> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') -->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# splitgophertype('/Xselector') --> 'X', 'selector'
# unquote('abc%20def') --> 'abc def'
# quote('abc def') --> 'abc%20def'
385
386def unwrap(url):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000387 url = string.strip(url)
388 if url[:1] == '<' and url[-1:] == '>':
389 url = string.strip(url[1:-1])
390 if url[:4] == 'URL:': url = string.strip(url[4:])
391 return url
392
# Split 'type:rest' into ('type', 'rest'); the type may not contain
# '/' or ':'.  Without a type the result is (None, url).
_typeprog = regex.compile('^\([^/:]+\):\(.*\)$')
def splittype(url):
    if _typeprog.match(url) < 0:
        return None, url
    return _typeprog.group(1, 2)
397
# Split '//host/rest' into ('host', '/rest'); the host runs up to the
# next '/'.  Without a leading '//host' the result is (None, url).
_hostprog = regex.compile('^//\([^/]+\)\(.*\)$')
def splithost(url):
    if _hostprog.match(url) < 0:
        return None, url
    return _hostprog.group(1, 2)
402
# Split 'user@host' at the first '@' into ('user', 'host').
# Without an '@' the result is (None, host).
_userprog = regex.compile('^\([^@]*\)@\(.*\)$')
def splituser(host):
    if _userprog.match(host) < 0:
        return None, host
    return _userprog.group(1, 2)
407
# Split 'user:passwd' at the first ':' into ('user', 'passwd').
# Without a ':' the result is (user, None).
_passwdprog = regex.compile('^\([^:]*\):\(.*\)$')
def splitpasswd(user):
    if _passwdprog.match(user) < 0:
        return user, None
    return _passwdprog.group(1, 2)
412
# Split 'host:port' into ('host', 'port') when the text after the last
# ':' consists of digits only; the port is returned as a string.
# Otherwise the result is (host, None).
_portprog = regex.compile('^\(.*\):\([0-9]+\)$')
def splitport(host):
    if _portprog.match(host) < 0:
        return host, None
    return _portprog.group(1, 2)
417
# Split '/path?query' at the last '?' into ('/path', 'query').
# Without a '?' the result is (url, None).
_queryprog = regex.compile('^\(.*\)\?\([^?]*\)$')
def splitquery(url):
    if _queryprog.match(url) < 0:
        return url, None
    return _queryprog.group(1, 2)
422
# Split '/path#tag' at the last '#' into ('/path', 'tag').
# Without a '#' the result is (url, None).
_tagprog = regex.compile('^\(.*\)#\([^#]*\)$')
def splittag(url):
    if _tagprog.match(url) < 0:
        return url, None
    return _tagprog.group(1, 2)
427
Guido van Rossum7c395db1994-07-04 22:14:49 +0000428def splitattr(url):
429 words = string.splitfields(url, ';')
430 return words[0], words[1:]
431
# Split 'attr=value' at the first '=' into ('attr', 'value').
# Without an '=' the result is (attr, None).
_valueprog = regex.compile('^\([^=]*\)=\(.*\)$')
def splitvalue(attr):
    if _valueprog.match(attr) < 0:
        return attr, None
    return _valueprog.group(1, 2)
436
# Split '/Xselector' into ('X', 'selector'), X being the single
# character that follows the leading '/'.  A selector that does not
# start with '/' plus one more character yields (None, selector).
def splitgophertype(selector):
    if selector[:1] != '/' or not selector[1:2]:
        return None, selector
    gophertype = selector[1]
    rest = selector[2:]
    return gophertype, rest
441
# Replace every %XX escape (XX being two hex digits) in s by the
# character with that code, e.g. unquote('abc%20def') --> 'abc def'.
# A '%' that is not followed by two hex digits is left alone.
_quoteprog = regex.compile('%[0-9a-fA-F][0-9a-fA-F]')
def unquote(s):
    i = 0
    n = len(s)
    # Collect the pieces in a list and join them once at the end,
    # rather than growing the result string piece by piece
    pieces = []
    while i < n:
        j = _quoteprog.search(s, i)
        if j < 0:
            # No further escapes; keep the remainder as is
            pieces.append(s[i:])
            break
        pieces.append(s[i:j])
        # The pattern guarantees two hex digits after the '%', so a
        # plain base 16 conversion does the job (no need for eval)
        pieces.append(chr(string.atoi(s[j+1:j+3], 16)))
        i = j+3
    return string.joinfields(pieces, '')
455
Guido van Rossum3bb54481994-08-29 10:52:58 +0000456always_safe = string.letters + string.digits + '_,.-'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000457def quote(s, safe = '/'):
458 safe = always_safe + safe
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000459 res = ''
460 for c in s:
Guido van Rossum7c395db1994-07-04 22:14:49 +0000461 if c in safe:
462 res = res + c
463 else:
464 res = res + '%%%02x' % ord(c)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000465 return res
466
467# Test and time quote() and unquote()
468def test1():
469 import time
470 s = ''
471 for i in range(256): s = s + chr(i)
472 s = s*4
473 t0 = time.time()
474 qs = quote(s)
475 uqs = unquote(qs)
476 t1 = time.time()
477 if uqs != s:
478 print 'Wrong!'
479 print `s`
480 print `qs`
481 print `uqs`
482 print round(t1 - t0, 3), 'sec'
483
484
485# Test program
486def test():
487 import sys
488 import regsub
489 args = sys.argv[1:]
490 if not args:
491 args = [
492 '/etc/passwd',
493 'file:/etc/passwd',
494 'file://localhost/etc/passwd',
495 'ftp://ftp.cwi.nl/etc/passwd',
496 'gopher://gopher.cwi.nl/11/',
497 'http://www.cwi.nl/index.html',
498 ]
499 try:
500 for url in args:
501 print '-'*10, url, '-'*10
502 fn, h = urlretrieve(url)
503 print fn, h
504 if h:
505 print '======'
506 for k in h.keys(): print k + ':', h[k]
507 print '======'
508 fp = open(fn, 'r')
509 data = fp.read()
510 del fp
511 print regsub.gsub('\r', '', data)
512 fn, h = None, None
513 print '-'*40
514 finally:
515 urlcleanup()
516
# When run as a script, run the test program
# (test1(), the quote()/unquote() timing test, is disabled)
if __name__ == '__main__':
##  test1()
    test()