blob: 14fcefa4fbc7d7519e4c7b26ca28fc2bd2bcbea6 [file] [log] [blame]
Guido van Rossum7c6ebb51994-03-22 12:05:32 +00001# Open an arbitrary URL
2#
3# See the following document for a tentative description of URLs:
4# Uniform Resource Locators Tim Berners-Lee
5# INTERNET DRAFT CERN
6# IETF URL Working Group 14 July 1993
7# draft-ietf-uri-url-01.txt
8#
9# The object returned by URLopener().open(file) will differ per
# protocol. All you know is that it has methods read(), readline(),
11# readlines(), fileno(), close() and info(). The read*(), fileno()
12# and close() methods work like those of open files.
13# The info() method returns an rfc822.Message object which can be
14# used to query various info about the object, if available.
15# (rfc822.Message objects are queried with the getheader() method.)
16
Guido van Rossum7c395db1994-07-04 22:14:49 +000017import string
Guido van Rossum7c6ebb51994-03-22 12:05:32 +000018import socket
19import regex
20
21
22# This really consists of two pieces:
23# (1) a class which handles opening of all sorts of URLs
24# (plus assorted utilities etc.)
25# (2) a set of functions for parsing URLs
26# XXX Should these be separated out into different modules?
27
28
29# Shortcut for basic usage
_urlopener = None
def urlopen(url):
    # Open url through the shared module-level URLopener, creating
    # that opener the first time it is needed.
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = URLopener()
    return opener.open(url)
def urlretrieve(url):
    # Retrieve url through the shared module-level URLopener, creating
    # that opener the first time it is needed.  Returns whatever
    # URLopener.retrieve() returns.
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = URLopener()
    return opener.retrieve(url)
def urlcleanup():
    # Ask the shared opener to clean up; does nothing when no shared
    # opener has been created yet.
    opener = _urlopener
    if not opener:
        return
    opener.cleanup()
44
45
46# Class to open URLs.
47# This is a class rather than just a subroutine because we may need
48# more than one set of global protocol-specific options.
49ftpcache = {}
50class URLopener:
51
52 # Constructor
53 def __init__(self):
54 self.addheaders = []
55 self.tempcache = {}
56 self.ftpcache = ftpcache
57 # Undocumented feature: you can use a different
58 # ftp cache by assigning to the .ftpcache member;
59 # in case you want logically independent URL openers
60
61 def __del__(self):
62 self.close()
63
64 def close(self):
65 self.cleanup()
66
67 def cleanup(self):
68 import os
69 for url in self.tempcache.keys():
70 try:
71 os.unlink(self.tempcache[url][0])
72 except os.error:
73 pass
74 del self.tempcache[url]
75
76 # Add a header to be used by the HTTP interface only
77 # e.g. u.addheader('Accept', 'sound/basic')
78 def addheader(self, *args):
79 self.addheaders.append(args)
80
81 # External interface
82 # Use URLopener().open(file) instead of open(file, 'r')
83 def open(self, url):
84 type, url = splittype(unwrap(url))
85 if not type: type = 'file'
86 name = 'open_' + type
87 if '-' in name:
88 import regsub
89 name = regsub.gsub('-', '_', name)
90 if not hasattr(self, name):
91 raise IOError, ('url error', 'unknown url type', type)
92 try:
93 return getattr(self, name)(url)
94 except socket.error, msg:
95 raise IOError, ('socket error', msg)
96
97 # External interface
98 # retrieve(url) returns (filename, None) for a local object
99 # or (tempfilename, headers) for a remote object
100 def retrieve(self, url):
101 if self.tempcache.has_key(url):
102 return self.tempcache[url]
103 url1 = unwrap(url)
104 if self.tempcache.has_key(url1):
105 self.tempcache[url] = self.tempcache[url1]
106 return self.tempcache[url1]
107 type, url1 = splittype(url1)
108 if not type or type == 'file':
109 try:
110 fp = self.open_local_file(url1)
111 del fp
112 return splithost(url1)[1], None
113 except IOError, msg:
114 pass
115 fp = self.open(url)
116 headers = fp.info()
117 import tempfile
118 tfn = tempfile.mktemp()
119 self.tempcache[url] = result = tfn, headers
120 tfp = open(tfn, 'w')
121 bs = 1024*8
122 block = fp.read(bs)
123 while block:
124 tfp.write(block)
125 block = fp.read(bs)
126 del fp
127 del tfp
128 return result
129
130 # Each method named open_<type> knows how to open that type of URL
131
132 # Use HTTP protocol
133 def open_http(self, url):
134 import httplib
135 host, selector = splithost(url)
Guido van Rossum590b2891994-04-18 09:39:56 +0000136 if not host: raise IOError, ('http error', 'no host given')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000137 h = httplib.HTTP(host)
138 h.putrequest('GET', selector)
139 for args in self.addheaders: apply(h.putheader, args)
140 errcode, errmsg, headers = h.getreply()
141 if errcode == 200: return addinfo(h.getfile(), headers)
142 else: raise IOError, ('http error', errcode, errmsg, headers)
143
144 # Use Gopher protocol
145 def open_gopher(self, url):
146 import gopherlib
147 host, selector = splithost(url)
Guido van Rossum590b2891994-04-18 09:39:56 +0000148 if not host: raise IOError, ('gopher error', 'no host given')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000149 type, selector = splitgophertype(selector)
150 selector, query = splitquery(selector)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000151 selector = unquote(selector)
152 if query:
153 query = unquote(query)
154 fp = gopherlib.send_query(selector, query, host)
155 else:
156 fp = gopherlib.send_selector(selector, host)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000157 return addinfo(fp, noheaders())
158
159 # Use local file or FTP depending on form of URL
160 def open_file(self, url):
161 try:
162 return self.open_local_file(url)
163 except IOError:
164 return self.open_ftp(url)
165
166 # Use local file
167 def open_local_file(self, url):
168 host, file = splithost(url)
169 if not host: return addinfo(open(file, 'r'), noheaders())
170 host, port = splitport(host)
171 if not port and socket.gethostbyname(host) in (
172 localhost(), thishost()):
Guido van Rossum7c395db1994-07-04 22:14:49 +0000173 file = unquote(file)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000174 return addinfo(open(file, 'r'), noheaders())
175 raise IOError, ('local file error', 'not on local host')
176
177 # Use FTP protocol
178 def open_ftp(self, url):
Guido van Rossum7c395db1994-07-04 22:14:49 +0000179 host, path = splithost(url)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000180 if not host: raise IOError, ('ftp error', 'no host given')
181 host, port = splitport(host)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000182 user, host = splituser(host)
183 if user: user, passwd = splitpasswd(user)
184 else: passwd = None
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000185 host = socket.gethostbyname(host)
186 if not port:
187 import ftplib
188 port = ftplib.FTP_PORT
Guido van Rossum7c395db1994-07-04 22:14:49 +0000189 path, attrs = splitattr(path)
190 dirs = string.splitfields(path, '/')
191 dirs, file = dirs[:-1], dirs[-1]
192 if dirs and not dirs[0]: dirs = dirs[1:]
193 key = (user, host, port, string.joinfields(dirs, '/'))
194 print 'key =', key
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000195 try:
196 if not self.ftpcache.has_key(key):
Guido van Rossum7c395db1994-07-04 22:14:49 +0000197 self.ftpcache[key] = \
198 ftpwrapper(user, passwd,
199 host, port, dirs)
200 if not file: type = 'D'
201 else: type = 'I'
202 for attr in attrs:
203 attr, value = splitvalue(attr)
204 if string.lower(attr) == 'type' and \
205 value in ('a', 'A', 'i', 'I', 'd', 'D'):
206 type = string.upper(value)
207 return addinfo(self.ftpcache[key].retrfile(file, type),
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000208 noheaders())
209 except ftperrors(), msg:
210 raise IOError, ('ftp error', msg)
211
212
213# Utility functions
214
215# Return the IP address of the magic hostname 'localhost'
_localhost = None
def localhost():
    # Look up 'localhost' on the first call only; later calls return
    # the address remembered in _localhost.
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
222
223# Return the IP address of the current host
_thishost = None
def thishost():
    # Look up this machine's own host name on the first call only;
    # later calls return the address remembered in _thishost.
    global _thishost
    if _thishost:
        return _thishost
    hostname = socket.gethostname()
    _thishost = socket.gethostbyname(hostname)
    return _thishost
230
231# Return the set of errors raised by the FTP class
_ftperrors = None
def ftperrors():
    # The tuple is built on the first call, so that ftplib is only
    # imported once FTP is actually used, and is reused afterwards.
    global _ftperrors
    if _ftperrors:
        return _ftperrors
    import ftplib
    _ftperrors = (ftplib.error_reply, ftplib.error_temp,
                  ftplib.error_perm, ftplib.error_proto)
    return _ftperrors
242
243# Return an empty rfc822.Message object
244_noheaders = None
245def noheaders():
246 global _noheaders
247 if not _noheaders:
248 import rfc822
249 _noheaders = rfc822.Message(open('/dev/null', 'r'))
250 _noheaders.fp.close() # Recycle file descriptor
251 return _noheaders
252
253
254# Utility classes
255
256# Class used by open_ftp() for cache of open FTP connections
257class ftpwrapper:
Guido van Rossum7c395db1994-07-04 22:14:49 +0000258 def __init__(self, user, passwd, host, port, dirs):
259 self.user = unquote(user or '')
260 self.passwd = unquote(passwd or '')
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000261 self.host = host
262 self.port = port
Guido van Rossum7c395db1994-07-04 22:14:49 +0000263 self.dirs = []
264 for dir in dirs:
265 self.dirs.append(unquote(dir))
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000266 self.init()
267 def init(self):
268 import ftplib
269 self.ftp = ftplib.FTP()
270 self.ftp.connect(self.host, self.port)
Guido van Rossum7c395db1994-07-04 22:14:49 +0000271 self.ftp.login(self.user, self.passwd)
272 for dir in self.dirs:
273 self.ftp.cwd(dir)
274 def retrfile(self, file, type):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000275 import ftplib
Guido van Rossum7c395db1994-07-04 22:14:49 +0000276 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
277 else: cmd = 'TYPE ' + type; isdir = 0
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000278 try:
Guido van Rossum7c395db1994-07-04 22:14:49 +0000279 self.ftp.voidcmd(cmd)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000280 except ftplib.all_errors:
281 self.init()
Guido van Rossum7c395db1994-07-04 22:14:49 +0000282 self.ftp.voidcmd(cmd)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000283 conn = None
Guido van Rossum7c395db1994-07-04 22:14:49 +0000284 if file and not isdir:
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000285 try:
286 cmd = 'RETR ' + file
287 conn = self.ftp.transfercmd(cmd)
288 except ftplib.error_perm, reason:
289 if reason[:3] != '550':
290 raise IOError, ('ftp error', reason)
291 if not conn:
292 # Try a directory listing
293 if file: cmd = 'LIST ' + file
294 else: cmd = 'LIST'
295 conn = self.ftp.transfercmd(cmd)
296 return addclosehook(conn.makefile('r'), self.ftp.voidresp)
297
298# Base class for addinfo and addclosehook
class addbase:
    # Wrap the open file fp, exposing its read(), readline(),
    # readlines() and fileno() methods directly on the wrapper.
    def __init__(self, fp):
        self.fp = fp
        self.read = self.fp.read
        self.readline = self.fp.readline
        self.readlines = self.fp.readlines
        self.fileno = self.fp.fileno
    def __repr__(self):
        return '<%s at %s whose fp = %s>' % (
            self.__class__.__name__, repr(id(self)), repr(self.fp))
    def __del__(self):
        self.close()
    # Drop the borrowed methods and close the wrapped file.
    # Calling close() again afterwards does nothing.
    def close(self):
        self.read = None
        self.readline = None
        self.readlines = None
        self.fileno = None
        fp = self.fp
        self.fp = None
        # Close the file explicitly instead of merely dropping the
        # reference and hoping it is reclaimed promptly.
        if fp is not None:
            fp.close()
317
318# Class to add a close hook to an open file
class addclosehook(addbase):
    # closehook is called with hookargs the first time close() runs
    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs
    def close(self):
        hook = self.closehook
        if hook:
            apply(hook, self.hookargs)
            # Forget the hook so that it runs only once
            self.closehook = None
            self.hookargs = None
        addbase.close(self)
329 addbase.close(self)
330
331# class to add an info() method to an open file
class addinfo(addbase):
    # headers is stored as given and handed back by info()
    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers
    def info(self):
        hdrs = self.headers
        return hdrs
338
339
340# Utility to combine a URL with a base URL to form a new URL
341
def basejoin(base, url):
    # A url that carries its own type is returned unchanged
    type, path = splittype(url)
    if type:
        return url
    host, path = splithost(path)
    basetype, rest = splittype(base)
    basehost, rest = splithost(rest)
    # The tag and query of the base play no part in the result
    rest = splittag(rest)[0]
    basepath = splitquery(rest)[0]
    type = basetype or 'file'
    if path[:1] != '/':
        # Relative path: graft it onto the directory of the base path
        slash = string.rfind(basepath, '/')
        if slash < 0:
            prefix = '/'
        else:
            prefix = basepath[:slash+1]
        path = prefix + path
    if not host:
        host = basehost
    if not host:
        return type + ':' + path
    return type + '://' + host + path
359
360
Guido van Rossum7c395db1994-07-04 22:14:49 +0000361# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
363# splittype('type:opaquestring') --> 'type', 'opaquestring'
364# splithost('//host[:port]/path') --> 'host[:port]', '/path'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000365# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
366# splitpasswd('user:passwd') -> 'user', 'passwd'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000367# splitport('host:port') --> 'host', 'port'
368# splitquery('/path?query') --> '/path', 'query'
369# splittag('/path#tag') --> '/path', 'tag'
Guido van Rossum7c395db1994-07-04 22:14:49 +0000370# splitattr('/path;attr1=value1;attr2=value2;...') ->
371# '/path', ['attr1=value1', 'attr2=value2', ...]
372# splitvalue('attr=value') --> 'attr', 'value'
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000373# splitgophertype('/Xselector') --> 'X', 'selector'
374# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
376
377def unwrap(url):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000378 url = string.strip(url)
379 if url[:1] == '<' and url[-1:] == '>':
380 url = string.strip(url[1:-1])
381 if url[:4] == 'URL:': url = string.strip(url[4:])
382 return url
383
384_typeprog = regex.compile('^\([^/:]+\):\(.*\)$')
385def splittype(url):
386 if _typeprog.match(url) >= 0: return _typeprog.group(1, 2)
387 return None, url
388
389_hostprog = regex.compile('^//\([^/]+\)\(.*\)$')
390def splithost(url):
391 if _hostprog.match(url) >= 0: return _hostprog.group(1, 2)
392 return None, url
393
Guido van Rossum7c395db1994-07-04 22:14:49 +0000394_userprog = regex.compile('^\([^@]*\)@\(.*\)$')
395def splituser(host):
396 if _userprog.match(host) >= 0: return _userprog.group(1, 2)
397 return None, host
398
399_passwdprog = regex.compile('^\([^:]*\):\(.*\)$')
400def splitpasswd(user):
401 if _passwdprog.match(user) >= 0: return _passwdprog.group(1, 2)
402 return user, None
403
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000404_portprog = regex.compile('^\(.*\):\([0-9]+\)$')
405def splitport(host):
406 if _portprog.match(host) >= 0: return _portprog.group(1, 2)
407 return host, None
408
409_queryprog = regex.compile('^\(.*\)\?\([^?]*\)$')
410def splitquery(url):
411 if _queryprog.match(url) >= 0: return _queryprog.group(1, 2)
412 return url, None
413
414_tagprog = regex.compile('^\(.*\)#\([^#]*\)$')
415def splittag(url):
416 if _tagprog.match(url) >= 0: return _tagprog.group(1, 2)
417 return url, None
418
Guido van Rossum7c395db1994-07-04 22:14:49 +0000419def splitattr(url):
420 words = string.splitfields(url, ';')
421 return words[0], words[1:]
422
423_valueprog = regex.compile('^\([^=]*\)=\(.*\)$')
424def splitvalue(attr):
425 if _valueprog.match(attr) >= 0: return _valueprog.group(1, 2)
426 return attr, None
427
def splitgophertype(selector):
    # The type is the single character right after a leading '/';
    # when there is no such character the type is None and the
    # selector comes back whole.
    if selector[:1] != '/' or not selector[1:2]:
        return None, selector
    return selector[1], selector[2:]
432
433_quoteprog = regex.compile('%[0-9a-fA-F][0-9a-fA-F]')
434def unquote(s):
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000435 i = 0
436 n = len(s)
437 res = ''
438 while 0 <= i < n:
439 j = _quoteprog.search(s, i)
440 if j < 0:
441 res = res + s[i:]
442 break
443 res = res + (s[i:j] + chr(eval('0x' + s[j+1:j+3])))
444 i = j+3
445 return res
446
Guido van Rossum7c395db1994-07-04 22:14:49 +0000447always_safe = string.letters + string.digits + '_,.+-'
448def quote(s, safe = '/'):
449 safe = always_safe + safe
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000450 res = ''
451 for c in s:
Guido van Rossum7c395db1994-07-04 22:14:49 +0000452 if c in safe:
453 res = res + c
454 else:
455 res = res + '%%%02x' % ord(c)
Guido van Rossum7c6ebb51994-03-22 12:05:32 +0000456 return res
457
458# Test and time quote() and unquote()
459def test1():
460 import time
461 s = ''
462 for i in range(256): s = s + chr(i)
463 s = s*4
464 t0 = time.time()
465 qs = quote(s)
466 uqs = unquote(qs)
467 t1 = time.time()
468 if uqs != s:
469 print 'Wrong!'
470 print `s`
471 print `qs`
472 print `uqs`
473 print round(t1 - t0, 3), 'sec'
474
475
476# Test program
477def test():
478 import sys
479 import regsub
480 args = sys.argv[1:]
481 if not args:
482 args = [
483 '/etc/passwd',
484 'file:/etc/passwd',
485 'file://localhost/etc/passwd',
486 'ftp://ftp.cwi.nl/etc/passwd',
487 'gopher://gopher.cwi.nl/11/',
488 'http://www.cwi.nl/index.html',
489 ]
490 try:
491 for url in args:
492 print '-'*10, url, '-'*10
493 fn, h = urlretrieve(url)
494 print fn, h
495 if h:
496 print '======'
497 for k in h.keys(): print k + ':', h[k]
498 print '======'
499 fp = open(fn, 'r')
500 data = fp.read()
501 del fp
502 print regsub.gsub('\r', '', data)
503 fn, h = None, None
504 print '-'*40
505 finally:
506 urlcleanup()
507
508# Run test program when run as a script
if __name__ == '__main__':
    # The quote()/unquote() timing check, test1(), is left disabled
    test()