blob: a4a891fc4d525843a76b92ae96921b1886c93b41 [file] [log] [blame]
# Open an arbitrary URL
#
# See the following document for a tentative description of URLs:
#     Uniform Resource Locators              Tim Berners-Lee
#     INTERNET DRAFT                                    CERN
#     IETF URL Working Group                    14 July 1993
#     draft-ietf-uri-url-01.txt
#
# The object returned by URLopener().open(file) will differ per
# protocol.  All you know is that it has methods read(), readline(),
# readlines(), fileno(), close() and info().  The read*(), fileno()
# and close() methods work like those of open files.
# The info() method returns an rfc822.Message object which can be
# used to query various info about the object, if available.
# (rfc822.Message objects are queried with the getheader() method.)
16
Guido van Rossum7c395db1994-07-04 22:14:49 +000017import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000018import socket
19import regex
20
21
22# This really consists of two pieces:
23# (1) a class which handles opening of all sorts of URLs
24# (plus assorted utilities etc.)
25# (2) a set of functions for parsing URLs
26# XXX Should these be separated out into different modules?
27
28
# Shortcut for basic usage: module-level functions that share one
# URLopener instance, created the first time it is needed
_urlopener = None
def urlopen(url):
    # Open url through the shared opener and return whatever its
    # open() method returns
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = URLopener()
    return opener.open(url)
def urlretrieve(url):
    # Retrieve url through the shared opener (created on first use)
    # and return whatever its retrieve() method returns
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = URLopener()
    return opener.retrieve(url)
def urlcleanup():
    # Let the shared opener clean up; nothing to do if no shared
    # opener has been created yet
    opener = _urlopener
    if not opener:
        return
    opener.cleanup()
44
45
46# Class to open URLs.
47# This is a class rather than just a subroutine because we may need
48# more than one set of global protocol-specific options.
49ftpcache = {}
50class URLopener:
51
52 # Constructor
53 def __init__(self):
54 self.addheaders = []
Guido van Rossum7aeb4b91994-08-23 13:32:20 +000055 self.tempcache = None
56 # Undocumented feature: if you assign {} to tempcache,
57 # it is used to cache files retrieved with
58 # self.retrieve(). This is not enabled by default
59 # since it does not work for changing documents (and I
60 # haven't got the logic to check expiration headers
61 # yet).
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000062 self.ftpcache = ftpcache
63 # Undocumented feature: you can use a different
64 # ftp cache by assigning to the .ftpcache member;
65 # in case you want logically independent URL openers
66
67 def __del__(self):
68 self.close()
69
70 def close(self):
71 self.cleanup()
72
73 def cleanup(self):
74 import os
Guido van Rossum7aeb4b91994-08-23 13:32:20 +000075 if self.tempcache:
76 for url in self.tempcache.keys():
77 try:
78 os.unlink(self.tempcache[url][0])
79 except os.error:
80 pass
81 del self.tempcache[url]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000082
83 # Add a header to be used by the HTTP interface only
84 # e.g. u.addheader('Accept', 'sound/basic')
85 def addheader(self, *args):
86 self.addheaders.append(args)
87
88 # External interface
89 # Use URLopener().open(file) instead of open(file, 'r')
90 def open(self, url):
91 type, url = splittype(unwrap(url))
92 if not type: type = 'file'
93 name = 'open_' + type
94 if '-' in name:
95 import regsub
96 name = regsub.gsub('-', '_', name)
97 if not hasattr(self, name):
98 raise IOError, ('url error', 'unknown url type', type)
99 try:
100 return getattr(self, name)(url)
101 except socket.error, msg:
102 raise IOError, ('socket error', msg)
103
104 # External interface
105 # retrieve(url) returns (filename, None) for a local object
106 # or (tempfilename, headers) for a remote object
107 def retrieve(self, url):
Guido van Rossum7aeb4b91994-08-23 13:32:20 +0000108 if self.tempcache and self.tempcache.has_key(url):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000109 return self.tempcache[url]
110 url1 = unwrap(url)
Guido van Rossum7aeb4b91994-08-23 13:32:20 +0000111 if self.tempcache and self.tempcache.has_key(url1):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000112 self.tempcache[url] = self.tempcache[url1]
113 return self.tempcache[url1]
114 type, url1 = splittype(url1)
115 if not type or type == 'file':
116 try:
117 fp = self.open_local_file(url1)
118 del fp
119 return splithost(url1)[1], None
120 except IOError, msg:
121 pass
122 fp = self.open(url)
123 headers = fp.info()
124 import tempfile
125 tfn = tempfile.mktemp()
Guido van Rossum7aeb4b91994-08-23 13:32:20 +0000126 if self.tempcache is not None:
127 self.tempcache[url] = result = tfn, headers
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000128 tfp = open(tfn, 'w')
129 bs = 1024*8
130 block = fp.read(bs)
131 while block:
132 tfp.write(block)
133 block = fp.read(bs)
134 del fp
135 del tfp
136 return result
137
138 # Each method named open_<type> knows how to open that type of URL
139
140 # Use HTTP protocol
141 def open_http(self, url):
142 import httplib
143 host, selector = splithost(url)
Guido van Rossum590b2891994-04-18 09:39:56 +0000144 if not host: raise IOError, ('http error', 'no host given')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000145 h = httplib.HTTP(host)
146 h.putrequest('GET', selector)
147 for args in self.addheaders: apply(h.putheader, args)
148 errcode, errmsg, headers = h.getreply()
149 if errcode == 200: return addinfo(h.getfile(), headers)
150 else: raise IOError, ('http error', errcode, errmsg, headers)
151
152 # Use Gopher protocol
153 def open_gopher(self, url):
154 import gopherlib
155 host, selector = splithost(url)
Guido van Rossum590b2891994-04-18 09:39:56 +0000156 if not host: raise IOError, ('gopher error', 'no host given')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000157 type, selector = splitgophertype(selector)
158 selector, query = splitquery(selector)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000159 selector = unquote(selector)
160 if query:
161 query = unquote(query)
162 fp = gopherlib.send_query(selector, query, host)
163 else:
164 fp = gopherlib.send_selector(selector, host)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000165 return addinfo(fp, noheaders())
166
167 # Use local file or FTP depending on form of URL
168 def open_file(self, url):
169 try:
170 return self.open_local_file(url)
171 except IOError:
172 return self.open_ftp(url)
173
174 # Use local file
175 def open_local_file(self, url):
176 host, file = splithost(url)
177 if not host: return addinfo(open(file, 'r'), noheaders())
178 host, port = splitport(host)
179 if not port and socket.gethostbyname(host) in (
180 localhost(), thishost()):
Guido van Rossum7c395db1994-07-04 22:14:49 +0000181 file = unquote(file)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000182 return addinfo(open(file, 'r'), noheaders())
183 raise IOError, ('local file error', 'not on local host')
184
185 # Use FTP protocol
186 def open_ftp(self, url):
Guido van Rossum7c395db1994-07-04 22:14:49 +0000187 host, path = splithost(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000188 if not host: raise IOError, ('ftp error', 'no host given')
189 host, port = splitport(host)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000190 user, host = splituser(host)
191 if user: user, passwd = splitpasswd(user)
192 else: passwd = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000193 host = socket.gethostbyname(host)
194 if not port:
195 import ftplib
196 port = ftplib.FTP_PORT
Guido van Rossum7c395db1994-07-04 22:14:49 +0000197 path, attrs = splitattr(path)
198 dirs = string.splitfields(path, '/')
199 dirs, file = dirs[:-1], dirs[-1]
200 if dirs and not dirs[0]: dirs = dirs[1:]
201 key = (user, host, port, string.joinfields(dirs, '/'))
Guido van Rossum3f9a6ec1994-08-12 13:16:50 +0000202## print 'key =', key
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000203 try:
204 if not self.ftpcache.has_key(key):
Guido van Rossum7c395db1994-07-04 22:14:49 +0000205 self.ftpcache[key] = \
206 ftpwrapper(user, passwd,
207 host, port, dirs)
208 if not file: type = 'D'
209 else: type = 'I'
210 for attr in attrs:
211 attr, value = splitvalue(attr)
212 if string.lower(attr) == 'type' and \
213 value in ('a', 'A', 'i', 'I', 'd', 'D'):
214 type = string.upper(value)
215 return addinfo(self.ftpcache[key].retrfile(file, type),
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000216 noheaders())
217 except ftperrors(), msg:
218 raise IOError, ('ftp error', msg)
219
220
221# Utility functions
222
# Return the IP address of the magic hostname 'localhost'
_localhost = None
def localhost():
    # The lookup is done on the first call only; later calls return
    # the remembered address
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
230
# Return the IP address of the current host
_thishost = None
def thishost():
    # The host name is resolved on the first call only; later calls
    # return the remembered address
    global _thishost
    if _thishost:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
238
# Return the set of errors raised by the FTP class
_ftperrors = None
def ftperrors():
    # ftplib is imported on the first call only; the tuple built
    # then is remembered and returned by every later call
    global _ftperrors
    if _ftperrors:
        return _ftperrors
    import ftplib
    _ftperrors = (ftplib.error_reply,
                  ftplib.error_temp,
                  ftplib.error_perm,
                  ftplib.error_proto)
    return _ftperrors
250
251# Return an empty rfc822.Message object
252_noheaders = None
253def noheaders():
254 global _noheaders
255 if not _noheaders:
256 import rfc822
257 _noheaders = rfc822.Message(open('/dev/null', 'r'))
258 _noheaders.fp.close() # Recycle file descriptor
259 return _noheaders
260
261
262# Utility classes
263
264# Class used by open_ftp() for cache of open FTP connections
265class ftpwrapper:
Guido van Rossum7c395db1994-07-04 22:14:49 +0000266 def __init__(self, user, passwd, host, port, dirs):
267 self.user = unquote(user or '')
268 self.passwd = unquote(passwd or '')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000269 self.host = host
270 self.port = port
Guido van Rossum7c395db1994-07-04 22:14:49 +0000271 self.dirs = []
272 for dir in dirs:
273 self.dirs.append(unquote(dir))
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000274 self.init()
275 def init(self):
276 import ftplib
277 self.ftp = ftplib.FTP()
278 self.ftp.connect(self.host, self.port)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000279 self.ftp.login(self.user, self.passwd)
280 for dir in self.dirs:
281 self.ftp.cwd(dir)
282 def retrfile(self, file, type):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000283 import ftplib
Guido van Rossum7c395db1994-07-04 22:14:49 +0000284 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
285 else: cmd = 'TYPE ' + type; isdir = 0
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000286 try:
Guido van Rossum7c395db1994-07-04 22:14:49 +0000287 self.ftp.voidcmd(cmd)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000288 except ftplib.all_errors:
289 self.init()
Guido van Rossum7c395db1994-07-04 22:14:49 +0000290 self.ftp.voidcmd(cmd)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000291 conn = None
Guido van Rossum7c395db1994-07-04 22:14:49 +0000292 if file and not isdir:
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000293 try:
294 cmd = 'RETR ' + file
295 conn = self.ftp.transfercmd(cmd)
296 except ftplib.error_perm, reason:
297 if reason[:3] != '550':
298 raise IOError, ('ftp error', reason)
299 if not conn:
300 # Try a directory listing
301 if file: cmd = 'LIST ' + file
302 else: cmd = 'LIST'
303 conn = self.ftp.transfercmd(cmd)
304 return addclosehook(conn.makefile('r'), self.ftp.voidresp)
305
# Base class for addinfo and addclosehook
class addbase:

    # Wrap fp; its read(), readline(), readlines() and fileno()
    # methods become attributes of this object
    def __init__(self, fp):
        self.fp = fp
        for name in ('read', 'readline', 'readlines', 'fileno'):
            setattr(self, name, getattr(fp, name))

    def __repr__(self):
        fields = (self.__class__.__name__, repr(id(self)), repr(self.fp))
        return '<%s at %s whose fp = %s>' % fields

    def __del__(self):
        self.close()

    # Drop every reference to the wrapped file; fp.close() itself is
    # not called here
    def close(self):
        self.read = self.readline = self.readlines = self.fileno = None
        self.fp = None
325
# Class to add a close hook to an open file
class addclosehook(addbase):

    # closehook is called with hookargs as its arguments when the
    # object is closed
    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.hookargs = hookargs
        self.closehook = closehook

    # Run the hook (once only: it is forgotten after the call), then
    # do the base class cleanup
    def close(self):
        hook = self.closehook
        if hook:
            apply(hook, self.hookargs)
            self.closehook = None
            self.hookargs = None
        addbase.close(self)
338
# Class to add an info() method to an open file
class addinfo(addbase):

    # headers is stored as given and handed back by info()
    def __init__(self, fp, headers):
        self.headers = headers
        addbase.__init__(self, fp)

    def info(self):
        return self.headers
346
347
# Utility to combine a URL with a base URL to form a new URL

def basejoin(base, url):
    # A URL that carries its own type is returned unchanged
    scheme, rest = splittype(url)
    if scheme:
        return url
    host, path = splithost(rest)
    basescheme, baserest = splittype(base)
    basehost, basepath = splithost(baserest)
    # Tag and query of the base play no part in the result
    basepath = splittag(basepath)[0]
    basepath = splitquery(basepath)[0]
    scheme = basescheme or 'file'
    if path[:1] != '/':
        # Relative path: resolve against the directory part of
        # the base path (or '/' if the base path has no slash)
        slash = string.rfind(basepath, '/')
        if slash < 0:
            path = '/' + path
        else:
            path = basepath[:slash+1] + path
    host = host or basehost
    if not host:
        return scheme + ':' + path
    return scheme + '://' + host + path
367
368
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') --> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') -->
#     '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# splitgophertype('/Xselector') --> 'X', 'selector'
# unquote('abc%20def') --> 'abc def'
# quote('abc def') --> 'abc%20def'
384
385def unwrap(url):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000386 url = string.strip(url)
387 if url[:1] == '<' and url[-1:] == '>':
388 url = string.strip(url[1:-1])
389 if url[:4] == 'URL:': url = string.strip(url[4:])
390 return url
391
392_typeprog = regex.compile('^\([^/:]+\):\(.*\)$')
393def splittype(url):
394 if _typeprog.match(url) >= 0: return _typeprog.group(1, 2)
395 return None, url
396
397_hostprog = regex.compile('^//\([^/]+\)\(.*\)$')
398def splithost(url):
399 if _hostprog.match(url) >= 0: return _hostprog.group(1, 2)
400 return None, url
401
Guido van Rossum7c395db1994-07-04 22:14:49 +0000402_userprog = regex.compile('^\([^@]*\)@\(.*\)$')
403def splituser(host):
404 if _userprog.match(host) >= 0: return _userprog.group(1, 2)
405 return None, host
406
407_passwdprog = regex.compile('^\([^:]*\):\(.*\)$')
408def splitpasswd(user):
409 if _passwdprog.match(user) >= 0: return _passwdprog.group(1, 2)
410 return user, None
411
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000412_portprog = regex.compile('^\(.*\):\([0-9]+\)$')
413def splitport(host):
414 if _portprog.match(host) >= 0: return _portprog.group(1, 2)
415 return host, None
416
417_queryprog = regex.compile('^\(.*\)\?\([^?]*\)$')
418def splitquery(url):
419 if _queryprog.match(url) >= 0: return _queryprog.group(1, 2)
420 return url, None
421
422_tagprog = regex.compile('^\(.*\)#\([^#]*\)$')
423def splittag(url):
424 if _tagprog.match(url) >= 0: return _tagprog.group(1, 2)
425 return url, None
426
Guido van Rossum7c395db1994-07-04 22:14:49 +0000427def splitattr(url):
428 words = string.splitfields(url, ';')
429 return words[0], words[1:]
430
431_valueprog = regex.compile('^\([^=]*\)=\(.*\)$')
432def splitvalue(attr):
433 if _valueprog.match(attr) >= 0: return _valueprog.group(1, 2)
434 return attr, None
435
def splitgophertype(selector):
    # '/Xselector' --> 'X', 'selector'; anything not starting with
    # '/' plus at least one more character gives None and the
    # unchanged selector
    if selector[:1] != '/' or not selector[1:2]:
        return None, selector
    return selector[1], selector[2:]
440
441_quoteprog = regex.compile('%[0-9a-fA-F][0-9a-fA-F]')
442def unquote(s):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000443 i = 0
444 n = len(s)
445 res = ''
446 while 0 <= i < n:
447 j = _quoteprog.search(s, i)
448 if j < 0:
449 res = res + s[i:]
450 break
451 res = res + (s[i:j] + chr(eval('0x' + s[j+1:j+3])))
452 i = j+3
453 return res
454
Guido van Rossum3bb54481994-08-29 10:52:58 +0000455always_safe = string.letters + string.digits + '_,.-'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000456def quote(s, safe = '/'):
457 safe = always_safe + safe
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000458 res = ''
459 for c in s:
Guido van Rossum7c395db1994-07-04 22:14:49 +0000460 if c in safe:
461 res = res + c
462 else:
463 res = res + '%%%02x' % ord(c)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000464 return res
465
466# Test and time quote() and unquote()
467def test1():
468 import time
469 s = ''
470 for i in range(256): s = s + chr(i)
471 s = s*4
472 t0 = time.time()
473 qs = quote(s)
474 uqs = unquote(qs)
475 t1 = time.time()
476 if uqs != s:
477 print 'Wrong!'
478 print `s`
479 print `qs`
480 print `uqs`
481 print round(t1 - t0, 3), 'sec'
482
483
484# Test program
485def test():
486 import sys
487 import regsub
488 args = sys.argv[1:]
489 if not args:
490 args = [
491 '/etc/passwd',
492 'file:/etc/passwd',
493 'file://localhost/etc/passwd',
494 'ftp://ftp.cwi.nl/etc/passwd',
495 'gopher://gopher.cwi.nl/11/',
496 'http://www.cwi.nl/index.html',
497 ]
498 try:
499 for url in args:
500 print '-'*10, url, '-'*10
501 fn, h = urlretrieve(url)
502 print fn, h
503 if h:
504 print '======'
505 for k in h.keys(): print k + ':', h[k]
506 print '======'
507 fp = open(fn, 'r')
508 data = fp.read()
509 del fp
510 print regsub.gsub('\r', '', data)
511 fn, h = None, None
512 print '-'*40
513 finally:
514 urlcleanup()
515
# Run test program when run as a script
# (uncomment the test1() call to time quote() and unquote() as well)
if __name__ == '__main__':
##  test1()
    test()