blob: 40a6715a7fbb2c985973c55a553c1a317eac39e2 [file] [log] [blame]
Jeremy Hylton6d7e47b2000-01-20 18:19:08 +00001"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
5below). It opens the URL and returns the results as a file-like
6object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301 & 302 redirect errors, and the HTTPDigestAuthHandler deals
15with digest authentication.
16
17urlopen(url, data=None) -- basic usage is the same as the original
18urllib: pass the URL and, optionally, data to post to an HTTP URL, and
19get a file-like object back. One difference is that you can also pass
20a Request instance instead of URL. Raises a URLError (subclass of
21IOError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- function that creates a new OpenerDirector instance.
25will install the default handlers. accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
27instantiate. If one of the arguments is a subclass of the default
28handler, the argument will be installed instead of the default.
29
30install_opener -- installs a new opener as the default opener.
31
32objects of interest:
33OpenerDirector --
34
35Request -- an object that encapsulates the state of a request. the
36state can be as simple as the URL. It can also include extra HTTP
37headers, e.g. a User-Agent.
38
39BaseHandler --
40
41exceptions:
42URLError-- a subclass of IOError, individual protocols have their own
43specific subclass
44
45HTTPError-- also a valid HTTP response, so you can treat an HTTP error
46as an exceptional event or valid response
47
48internals:
49BaseHandler and parent
50_call_chain conventions
51
52Example usage:
53
54import urllib2
55
56# set up authentication info
57authinfo = urllib2.HTTPBasicAuthHandler()
58authinfo.add_password('realm', 'host', 'username', 'password')
59
60# build a new opener that adds authentication and caching FTP handlers
61opener = urllib2.build_opener(authinfo, urllib2.CacheFTPHandler)
62
63# install it
64urllib2.install_opener(opener)
65
66f = urllib2.urlopen('http://www.python.org/')
67
68
69"""
70
71# XXX issues:
72# If an authentication error handler tries to perform
73 # authentication for some reason but fails, how should the error be
74 # signalled? The client needs to know the HTTP error code. But if
75 # the handler knows what the problem was, e.g., that it didn't know
76 # the hash algorithm that was requested in the challenge, it would be
77 # good to pass that information along to the client, too.
78
79# XXX to do:
80# name!
81# documentation (getting there)
82# complex proxies
83# abstract factory for opener
84# ftp errors aren't handled cleanly
85# gopher can return a socket.error
86# check digest against correct (i.e. non-apache) implementation
87
88import string
89import socket
90import UserDict
91import httplib
92import re
93import base64
94import types
95import urlparse
96import os
97import md5
98import mimetypes
99import mimetools
100import ftplib
101import sys
102import time
103import gopherlib
104
105try:
106 from cStringIO import StringIO
107except ImportError:
108 from StringIO import StringIO
109
110try:
111 import sha
112except ImportError:
113 # need 1.5.2 final
114 sha = None
115
116# not sure how many of these need to be gotten rid of
117from urllib import unwrap, unquote, splittype, splithost, \
118 addinfourl, splitport, splitgophertype, splitquery, \
119 splitattr, ftpwrapper, noheaders
120
121# support for proxies via environment variables
122from urllib import getproxies
123
124# support for FileHandler
125from urllib import localhost, thishost, url2pathname, pathname2url
126
127# support for GopherHandler
128from urllib import splitgophertype, splitquery
129
130__version__ = "2.0a1"
131
# The opener used by urlopen(); created lazily or set by install_opener().
_opener = None


def urlopen(url, data=None):
    """Open url with the module-wide default opener and return the result.

    url and data are handed unchanged to the opener's open() method.
    If no default opener exists yet, one is created with build_opener()
    and remembered for later calls.
    """
    global _opener
    opener = _opener
    if opener is None:
        # first use: build the default opener and cache it
        opener = _opener = build_opener()
    return opener.open(url, data)
138
def install_opener(opener):
    """Make opener the module-wide default opener.

    The object is stored in the module global _opener without any
    checking.
    """
    global _opener
    _opener = opener
142
143# do these error classes make sense?
144# make sure all of the IOError stuff is overridden. we just want to be
145 # subtypes.
146
class URLError(IOError):
    """Error raised when a URL cannot be opened.

    URLError is a sub-type of IOError, but it doesn't share any of
    the implementation, so __init__ and __str__ are overridden.

    reason -- whatever describes the failure; it is kept on the
              instance and interpolated into str(error).
    """

    def __init__(self, reason):
        # IOError.__init__ is deliberately not called, so record the
        # argument here; otherwise generic exception handling that looks
        # at .args would see nothing.
        self.args = (reason,)
        self.reason = reason

    def __str__(self):
        return '<urlopen error %s>' % self.reason
155
class HTTPError(URLError, addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return

    url  -- the URL that was requested (also stored as .filename)
    code -- the HTTP status code
    msg  -- the message that accompanied the status code
    hdrs -- the response headers
    fp   -- file-like object for the response body, or None when no
            body is available
    """

    def __init__(self, url, code, msg, hdrs, fp=None):
        # Only set up the addinfourl half when there is a body to wrap;
        # addinfourl.__init__ cannot cope with fp being None.
        if fp is not None:
            addinfourl.__init__(self, fp, hdrs, url)
        self.code = code
        self.msg = msg
        self.hdrs = hdrs
        self.fp = fp
        # XXX
        self.filename = url

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)

    def __del__(self):
        # XXX is this safe?  what if user catches exception, then
        # extracts fp and discards exception?
        # Look in the instance dict directly: __init__ may have failed
        # before fp was stored, and fp may legitimately be None.
        fp = self.__dict__.get('fp')
        if fp is not None:
            fp.close()
175
class GopherError(URLError):
    """URLError subclass for failures specific to gopher URLs."""
178
class Request:
    """An object that encapsulates the state of a request.

    The state can be as simple as the URL; it can also carry data to
    send and extra headers.  The URL is taken apart lazily: the type,
    host and selector are only computed when they are first asked for.

    url     -- the URL, optionally wrapped as '<URL:type://host/path>'
    data    -- optional data to send with the request
    headers -- optional mapping of extra headers; it is copied, so the
               caller's mapping is never modified
    """

    def __init__(self, url, data=None, headers=None):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self.data = data
        self.headers = {}
        if headers:
            self.headers.update(headers)

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        prefix = '_Request__r_'
        if attr[:len(prefix)] == prefix:
            getter = 'get_' + attr[len(prefix):]
            if hasattr(Request, getter):
                getattr(self, getter)()
                # Look in the instance dict rather than calling
                # getattr() again: if the getter did not create the
                # attribute, getattr() would land back here and recurse
                # without bound.
                try:
                    return self.__dict__[attr]
                except KeyError:
                    pass
        raise AttributeError(attr)

    def add_data(self, data):
        """Set the data to send with the request."""
        self.data = data

    def has_data(self):
        """Return true if data has been set (i.e. is not None)."""
        return self.data is not None

    def get_data(self):
        """Return the data to send, or None."""
        return self.data

    def get_full_url(self):
        """Return the unwrapped URL the request was created with."""
        return self.__original

    def get_type(self):
        """Return the URL type (scheme), splitting the URL if needed."""
        if self.type is None:
            self.type, self.__r_type = splittype(self.__original)
        return self.type

    def get_host(self):
        """Return the unquoted host part of the URL, or None."""
        if self.host is None:
            self.host, self.__r_host = splithost(self.__r_type)
            if self.host:
                self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        """Return what follows the host part of the URL.

        After set_proxy() this is the full original URL instead.
        """
        return self.__r_host

    def set_proxy(self, proxy):
        """Redirect the request through the proxy at URL proxy."""
        self.__proxy = proxy
        # XXX this code is based on urllib, but it doesn't seem
        # correct.  specifically, if the proxy has a port number then
        # splittype will return the hostname as the type and the port
        # will be include with everything else
        self.type, self.__r_type = splittype(self.__proxy)
        self.host, ignored = splithost(self.__r_type)
        self.host = unquote(self.host)
        # the proxy is asked for the complete original URL
        self.__r_host = self.__original

    def add_header(self, key, val):
        """Add or replace the header key with value val."""
        # useful for something like authentication
        self.headers[key] = val
244
class OpenerDirector:
    """Dispatch requests to the handlers registered with add_handler().

    Handlers are indexed by the names of their methods:

    handle_open  -- maps a protocol name to the handlers that have a
                    '<protocol>_open' method
    handle_error -- maps a protocol name to a dictionary that maps an
                    error kind (an int where the name allows it) to the
                    handlers that have a '<protocol>_error_<kind>' method
    """

    def __init__(self):
        server_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', server_version)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}

    def add_handler(self, handler):
        """Register handler under every method name that matches.

        A handler with no matching method is ignored.
        """
        added = 0
        for meth in get_methods(handler):
            if meth[-5:] == '_open':
                protocol = meth[:-5]
                self.handle_open.setdefault(protocol, []).append(handler)
                added = 1
                continue
            # '<proto>_error_<kind>'; kind may itself contain underscores
            parts = meth.split('_', 2)
            if len(parts) == 3 and parts[1] == 'error':
                proto, kind = parts[0], parts[2]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                by_kind = self.handle_error.setdefault(proto, {})
                by_kind.setdefault(kind, []).append(handler)
                added = 1
        if added:
            self.handlers.append(handler)
            handler.add_parent(self)

    def __del__(self):
        self.close()

    def close(self):
        """Close every registered handler and forget them."""
        for handler in self.handlers:
            handler.close()
        self.handlers = []

    def _call_chain(self, chain, kind, meth_name, *args):
        """Call meth_name(*args) on each handler in chain[kind] in turn.

        Returns the first result that is not None, or None if every
        handler declined or there are no handlers for kind.
        """
        # XXX raise an exception if no one else should try to handle
        # this url.  return None if you can't but someone else could.
        for handler in chain.get(kind, ()):
            result = getattr(handler, meth_name)(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None):
        """Open fullurl, which may be a URL string or a Request object.

        The 'default' handlers are tried first, then the handlers for
        the URL's type, then the 'unknown' handlers.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)
        assert isinstance(req, Request) # really only care about interface

        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        type_ = req.get_type()
        result = self._call_chain(self.handle_open, type_,
                                  type_ + '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Hand an error to the error handlers registered for proto.

        For 'http' the third extra argument is taken as the status code
        and selects 'http_error_<code>'; if no handler returns a true
        result, 'http_error_default' is tried.
        """
        if proto == 'http':
            # XXX http protocol is special cased
            chain = self.handle_error[proto]
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%d' % proto
            http_err = 1
        else:
            chain = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        result = self._call_chain(chain, proto, meth_name, *args)
        if result:
            return result

        if http_err:
            return self._call_chain(chain, 'default',
                                    'http_error_default', *args)
348
def is_callable(obj):
    """Return true if obj is one of the callable kinds recognised here.

    Built-in functions and methods, functions (including lambdas) and
    methods count as callable; so does an instance that has a __call__
    attribute.  Everything else is reported as not callable.
    """
    # not quite like builtin callable (which I didn't know existed),
    # not entirely sure it needs to be different
    kind = type(obj)
    for callable_kind in (types.BuiltinFunctionType,
                          types.BuiltinMethodType, types.LambdaType,
                          types.MethodType):
        if kind == callable_kind:
            return 1
    if kind != types.InstanceType:
        return 0
    return hasattr(obj, '__call__')
359
def get_methods(inst):
    """Return the names of the methods and callable attributes of inst.

    The class of inst and all of its base classes are searched for
    unbound methods; then every attribute listed by dir(inst) that
    is_callable() accepts is added.  Each name appears once.
    """
    found = {}
    pending = [inst.__class__]
    while pending:
        klass = pending.pop(0)
        pending.extend(klass.__bases__)
        for name in dir(klass):
            if type(getattr(klass, name)) == types.UnboundMethodType:
                found[name] = 1
    for name in dir(inst):
        if is_callable(getattr(inst, name)):
            found[name] = 1
    return found.keys()
376
377# XXX probably also want an abstract factory that knows things like
378 # the fact that a ProxyHandler needs to get inserted first.
379# would also know when it makes sense to skip a superclass in favor of
380 # a subclass and when it might make sense to include both
381
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.  If there is a ProxyHandler, it must be at the
    front of the list of handlers.  (Yuck.)

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.

    handlers -- handler instances, or handler classes, which are
                instantiated with no arguments

    Returns the new OpenerDirector.
    """
    import inspect

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler]
    for klass in default_classes:
        # A default is dropped as soon as one argument overrides it.
        # Deciding per default class (rather than collecting a list of
        # classes to remove) keeps two overriding arguments from
        # removing the same class twice.
        for check in handlers:
            if inspect.isclass(check):
                overridden = issubclass(check, klass)
            else:
                overridden = isinstance(check, klass)
            if overridden:
                break
        else:
            opener.add_handler(klass())

    for h in handlers:
        if inspect.isclass(h):
            h = h()
        opener.add_handler(h)
    return opener
417
class BaseHandler:
    """Base class for handlers; keeps track of the owning opener."""

    def add_parent(self, parent):
        """Remember parent, the object this handler was added to."""
        self.parent = parent

    def close(self):
        """Drop the reference to the parent."""
        self.parent = None
423
class HTTPDefaultErrorHandler(BaseHandler):
    """Turn any HTTP error that reaches it into an HTTPError exception."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        """Raise HTTPError for the request's full URL."""
        url = req.get_full_url()
        raise HTTPError(url, code, msg, hdrs, fp)
427
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301 and 302 redirects."""

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Open the URL named by the 'location' (or 'uri') header.

        Returns None when neither header is present.  Raises HTTPError
        when the new URL has already been visited while following
        redirects for this request.
        """
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        # the header may hold a relative reference; resolve it against
        # the URL that produced the redirect
        newurl = urlparse.urljoin(req.get_full_url(), newurl)
        nil = fp.read()
        fp.close()

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = Request(newurl, req.get_data())
        new.error_302_dict = {}
        if hasattr(req, 'error_302_dict'):
            if req.error_302_dict.has_key(newurl):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
            new.error_302_dict.update(req.error_302_dict)
        new.error_302_dict[newurl] = newurl
        return self.parent.open(new)

    http_error_301 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 302 error message was:\n"
461
class ProxyHandler(BaseHandler):
    """Route requests through proxies, one proxy URL per URL type.

    proxies -- mapping from URL type to proxy URL; when omitted, the
               result of getproxies() is used
    """

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, proxy_url in proxies.items():
            # Give the instance a '<type>_open' attribute per proxied
            # type; the defaults bind the loop values at definition time.
            opener = lambda r, proxy=proxy_url, type=scheme, \
                     meth=self.proxy_open: meth(r, proxy, type)
            setattr(self, '%s_open' % scheme, opener)

    def proxy_open(self, req, proxy, type):
        """Point req at proxy and decide who opens it.

        Returns None when the proxy's type equals type, otherwise the
        result of opening the request again through the parent.
        """
        orig_type = req.get_type()
        req.set_proxy(proxy)
        if orig_type != type:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            return self.parent.open(req)
        # let other handlers take care of it
        # XXX this only makes sense if the proxy is before the
        # other handlers
        return None
485
486# feature suggested by Duncan Booth
487# XXX custom is not a good name
class CustomProxy:
    """Pair a proxy address with a test that says when to use it.

    proto      -- the protocol the proxy applies to
    func       -- optional function called with the request; a true
                  result means the proxy should be used
    proxy_addr -- the proxy address returned by get_proxy()
    """
    # either pass a function to the constructor or override handle

    def __init__(self, proto, func=None, proxy_addr=None):
        self.proto = proto
        self.func = func
        self.addr = proxy_addr

    def handle(self, req):
        """Return 1 if func is set and accepts req, otherwise None."""
        if not self.func:
            return None
        if self.func(req):
            return 1
        return None

    def get_proxy(self):
        """Return the proxy address given to the constructor."""
        return self.addr
501
class CustomProxyHandler(BaseHandler):
    """Choose a proxy per request by asking registered proxy objects.

    proxies -- objects with a proto attribute and handle(req) and
               get_proxy() methods (see CustomProxy); they are
               registered in the order given
    """
    # NOTE(review): OpenerDirector.add_handler files any method named
    # '<protocol>_open' under <protocol>, so proxy_open looks like it is
    # registered for a protocol called 'proxy' -- confirm how this
    # handler is meant to be reached.

    def __init__(self, *proxies):
        self.proxies = {}
        for cpo in proxies:
            self.add_proxy(cpo)

    def proxy_open(self, req):
        """Open req through the first registered proxy that accepts it.

        Returns None when no proxy is registered for the request's type
        or none of them accepts the request.
        """
        proto = req.get_type()
        try:
            proxies = self.proxies[proto]
        except KeyError:
            return None
        for p in proxies:
            if p.handle(req):
                req.set_proxy(p.get_proxy())
                return self.parent.open(req)
        return None

    def do_proxy(self, p, req):
        """Open req through proxy object p unconditionally."""
        req.set_proxy(p.get_proxy())
        return self.parent.open(req)

    def add_proxy(self, cpo):
        """Register proxy object cpo under its protocol."""
        self.proxies.setdefault(cpo.proto, []).append(cpo)
527
class HTTPPasswordMgr:
    """Store (user, password) pairs by realm and URI.

    self.passwd maps a realm to a dictionary that maps a tuple of
    reduced URIs to the (user, password) pair registered for them.
    """

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register user and passwd for realm at one or more URIs.

        uri may be a single URI string or a sequence of them.
        """
        # uri could be a single URI or a sequence
        try:
            string_types = (str, unicode)
        except NameError:
            string_types = (str,)
        # isinstance, not an exact type comparison: any other kind of
        # string would be iterated character by character below
        if isinstance(uri, string_types):
            uri = [uri]
        uri = tuple(map(self.reduce_uri, uri))
        self.passwd.setdefault(realm, {})[uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) registered for authuri in realm.

        Returns (None, None) when nothing matches.
        """
        domains = self.passwd.get(realm, {})
        authuri = self.reduce_uri(authuri)
        for uris, authinfo in domains.items():
            for uri in uris:
                if self.is_suburi(uri, authuri):
                    return authinfo
        return None, None

    def reduce_uri(self, uri):
        """Accept netloc or URI and extract only the netloc and path"""
        parts = urlparse.urlparse(uri)
        if parts[1]:
            return parts[1], parts[2] or '/'
        else:
            # no netloc was parsed: treat the path component as the
            # netloc
            return parts[2], '/'

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return 1
        if base[0] != test[0]:
            return 0
        common = os.path.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return 1
        return 0
571
572
class HTTPBasicAuthHandler(BaseHandler):
    """Answer HTTP 401 responses that ask for basic authentication.

    Passwords are registered through add_password(), which is the
    method of the same name of the handler's HTTPPasswordMgr.
    """
    rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"')

    # XXX there can actually be multiple auth-schemes in a
    # www-authenticate header.  should probably be a lot more careful
    # in parsing them to extract multiple alternatives

    def __init__(self):
        self.passwd = HTTPPasswordMgr()
        self.add_password = self.passwd.add_password
        self.__current_realm = None
        # if __current_realm is not None, then the server must have
        # refused our name/password and is asking for authorization
        # again.  must be careful to set it to None on successful
        # return.

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry the request if the challenge uses the basic scheme."""
        # XXX could be mult. headers
        authreq = headers.get('www-authenticate', None)
        if authreq:
            mo = HTTPBasicAuthHandler.rx.match(authreq)
            if mo:
                scheme, realm = mo.groups()
                if scheme.lower() == 'basic':
                    return self.retry_http_basic_auth(req, realm)

    def retry_http_basic_auth(self, req, realm):
        """Re-open req with an Authorization header for realm.

        Returns None when a retry is already in progress or when no
        password is known for the request's host.
        """
        if self.__current_realm is not None:
            # the retry itself was refused; give up
            self.__current_realm = realm
            return None
        self.__current_realm = realm
        try:
            # XXX host isn't really the correct URI?
            host = req.get_host()
            user, pw = self.passwd.find_user_password(realm, host)
            if not pw:
                return None
            raw = "%s:%s" % (user, pw)
            # encodestring breaks long output into lines; a header
            # value must not contain the newlines
            auth = ''.join(base64.encodestring(raw).split())
            req.add_header('Authorization', 'Basic %s' % auth)
            return self.parent.open(req)
        finally:
            # also runs when the retried open raises; without this the
            # marker would stay set and every later challenge would be
            # refused
            self.__current_realm = None
618
class HTTPDigestAuthHandler(BaseHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    def __init__(self):
        self.passwd = HTTPPasswordMgr()
        self.add_password = self.passwd.add_password
        # realm of the challenge being answered; None when no retry is
        # in progress
        self.__current_realm = None

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry the request if the challenge uses the digest scheme."""
        # XXX could be mult. headers
        authreq = headers.get('www-authenticate', None)
        if authreq:
            words = authreq.split()
            if words and words[0].lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Re-open req with an Authorization header answering auth.

        auth is the complete www-authenticate header value.  Returns
        None when no authorization can be computed.
        """
        parts = auth.split(None, 1)
        if len(parts) != 2:
            # scheme name without any challenge parameters
            return None
        chal = parse_keqv_list(parse_http_list(parts[1]))
        auth = self.get_authorization(req, chal)
        if auth:
            req.add_header('Authorization', 'Digest %s' % auth)
            try:
                return self.parent.open(req)
            finally:
                # also reset when the retried open raises
                self.__current_realm = None

    def get_authorization(self, req, chal):
        """Return the Authorization header parameters for chal, or None.

        chal is the parsed challenge (a dictionary).  None is returned
        when the challenge lacks a realm or nonce, when a retry is
        already in progress, when the algorithm is not supported, or
        when no user is known for the realm and URL.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        if self.__current_realm is not None:
            # the retry itself was refused; give up
            self.__current_realm = realm
            return None
        self.__current_realm = realm

        auth = self._build_authorization(req, chal, realm, nonce,
                                         algorithm, opaque)
        if auth is None:
            # nothing will be retried, so don't leave the marker set
            self.__current_realm = None
        return auth

    def _build_authorization(self, req, chal, realm, nonce, algorithm,
                             opaque):
        # Compute the digest response and format the header parameters.
        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm,
                                                  req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
            method = 'POST'
        else:
            entdig = None
            method = 'GET'

        A1 = "%s:%s:%s" % (user, realm, pw)
        # XXX selector: what about proxies and full urls
        A2 = "%s:%s" % (method, req.get_selector())
        respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base = base + ', opaque="%s"' % opaque
        if entdig:
            base = base + ', digest="%s"' % entdig
        if algorithm != 'MD5':
            base = base + ', algorithm="%s"' % algorithm
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the functions (H, KD) for algorithm.

        Returns (None, None) when the algorithm is not supported or its
        digest module is not available.
        """
        # assumes digest modules are imported at the top level
        if algorithm == 'MD5':
            digest_module = md5
        elif algorithm == 'SHA':
            digest_module = sha     # None when the import failed
        else:
            # XXX MD5-sess
            digest_module = None
        if digest_module is None:
            return None, None
        H = lambda x, e=encode_digest, new=digest_module.new: \
            e(new(x).digest())
        KD = lambda s, d, H=H: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
712
def encode_digest(digest):
    """Return digest as a string of lowercase hexadecimal digits.

    Each character of digest contributes two digits, taken from the
    low eight bits of its ordinal: high nibble first, then low nibble.
    """
    pairs = ['%02x' % (ord(c) & 0xff) for c in digest]
    return ''.join(pairs)
721
722
class HTTPHandler(BaseHandler):
    """Open http URLs with httplib."""

    def http_open(self, req):
        """Send req and return the response.

        A request that has data is sent as a POST, any other as a GET.
        A 200 reply is returned as an addinfourl object; any other
        status is passed to the parent's error() method.  Raises
        URLError when the request has no host.
        """
        # XXX devise a new mechanism to specify user/password
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = httplib.HTTP(host)  # will parse host:port
##        h.set_debuglevel(1)
        sending = req.has_data()
        if sending:
            data = req.get_data()
            h.putrequest('POST', req.get_selector())
            h.putheader('Content-type', 'application/x-www-form-urlencoded')
            h.putheader('Content-length', '%d' % len(data))
        else:
            h.putrequest('GET', req.get_selector())
        # XXX proxies would have different host here
        h.putheader('Host', host)
        # the opener's standing headers first, then the request's own
        for header in self.parent.addheaders:
            h.putheader(*header)
        for name, value in req.headers.items():
            h.putheader(name, value)
        h.endheaders()
        if sending:
            h.send(data + '\r\n')

        code, msg, hdrs = h.getreply()
        fp = h.getfile()
        if code != 200:
            # want to make sure the socket is closed, even if error
            # handling doesn't return immediately.  the socket won't
            # actually be closed until fp is also closed.
            if h.sock:
                h.sock.close()
                h.sock = None
            return self.parent.error('http', req, fp, code, msg, hdrs)
        return addinfourl(fp, hdrs, req.get_full_url())
761
class UnknownHandler(BaseHandler):
    """Report URLs whose type no other handler opened."""

    def unknown_open(self, req):
        """Raise URLError naming the request's URL type."""
        url_type = req.get_type()
        raise URLError('unknown url type: %s' % url_type)
766
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    l is a sequence of 'key=value' strings.  A value enclosed in
    double quotes has the quotes removed.  Returns a dictionary that
    maps each key to its value.  Raises ValueError for an element that
    contains no '='.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # need at least two characters for a pair of quotes; this also
        # keeps an empty value from being indexed
        if len(v) >= 2 and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
776
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma, and a backslash inside a quoted-string escapes
    the character that follows it.

    Returns the list of elements with surrounding whitespace removed;
    quotes and backslashes are left in place and empty elements are
    dropped.  Raises ValueError if a quoted-string is not terminated.
    """
    elements = []
    current = []
    in_quote = 0
    escaped = 0
    for ch in s:
        if in_quote:
            current.append(ch)
            if escaped:
                # this character was escaped; it cannot end the string
                escaped = 0
            elif ch == '\\':
                escaped = 1
            elif ch == '"':
                in_quote = 0
        elif ch == ',':
            elements.append(''.join(current))
            current = []
        else:
            if ch == '"':
                in_quote = 1
            current.append(ch)
    if in_quote:
        raise ValueError("unbalanced quotes")
    elements.append(''.join(current))

    result = []
    for elt in elements:
        elt = elt.strip()
        if elt:
            result.append(elt)
    return result
822
class FileHandler(BaseHandler):
    """Open file URLs."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open req as a local file, or hand it on as an ftp request.

        A selector that starts with exactly two slashes is re-opened
        through the parent with the request's type changed to 'ftp'.
        """
        url = req.get_selector()
        remote = url[:2] == '//' and url[2:3] != '/'
        if not remote:
            return self.open_local_file(req)
        req.type = 'ftp'
        return self.parent.open(req)

    # names for the localhost
    names = None

    def get_names(self):
        """Return the addresses of 'localhost' and of this host's name.

        The lookup is done once and cached on the class.
        """
        if FileHandler.names is None:
            loopback = socket.gethostbyname('localhost')
            own = socket.gethostbyname(socket.gethostname())
            FileHandler.names = (loopback, own)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return the local file named by req as an addinfourl object.

        The Content-Type header is guessed from the selector and falls
        back to text/plain.  Raises URLError when the URL names a port
        or a host that does not resolve to one of get_names().
        """
        mtype = mimetypes.guess_type(req.get_selector())[0]
        content_type = 'Content-Type: %s\n' % (mtype or 'text/plain')
        headers = mimetools.Message(StringIO(content_type))
        host = req.get_host()
        file = req.get_selector()
        if host:
            host, port = splitport(host)
        if host:
            is_local = not port and \
                       socket.gethostbyname(host) in self.get_names()
        else:
            is_local = 1
        if not is_local:
            raise URLError('file not on local host')
        return addinfourl(open(url2pathname(file), 'rb'),
                          headers, 'file:'+file)
855
856class FTPHandler(BaseHandler):
857 def ftp_open(self, req):
858 host = req.get_host()
859 if not host:
860 raise IOError, ('ftp error', 'no host given')
861 # XXX handle custom username & password
862 host = socket.gethostbyname(host)
863 host, port = splitport(host)
864 if port is None:
865 port = ftplib.FTP_PORT
866 path, attrs = splitattr(req.get_selector())
867 path = unquote(path)
868 dirs = string.splitfields(path, '/')
869 dirs, file = dirs[:-1], dirs[-1]
870 if dirs and not dirs[0]:
871 dirs = dirs[1:]
872 user = passwd = '' # XXX
873 try:
874 fw = self.connect_ftp(user, passwd, host, port, dirs)
875 type = file and 'I' or 'D'
876 for attr in attrs:
877 attr, value = splitattr(attr)
878 if string.lower(attr) == 'type' and \
879 value in ('a', 'A', 'i', 'I', 'd', 'D'):
880 type = string.upper(value)
881 fp, retrlen = fw.retrfile(file, type)
882 if retrlen is not None and retrlen >= 0:
883 sf = StringIO('Content-Length: %d\n' % retrlen)
884 headers = mimetools.Message(sf)
885 else:
886 headers = noheaders()
887 return addinfourl(fp, headers, req.get_full_url())
888 except ftplib.all_errors, msg:
889 raise IOError, ('ftp error', msg), sys.exc_info()[2]
890
891 def connect_ftp(self, user, passwd, host, port, dirs):
892 fw = ftpwrapper(user, passwd, host, port, dirs)
893## fw.ftp.set_debuglevel(1)
894 return fw
895
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps connections open and reuses them.

    cache     -- maps (user, passwd, host, port) to an ftpwrapper
    timeout   -- maps the same keys to the time the entry expires
    soonest   -- the earliest expiry time in timeout (0 when empty)
    delay     -- seconds an entry stays valid after its last use
    max_conns -- cache size at which an entry is evicted
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}
        self.timeout = {}
        self.soonest = 0
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        """Set the number of seconds a connection stays cached."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which an entry is evicted."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs):
        """Return a cached ftpwrapper for the login, creating one if needed.

        Using an entry (new or old) restarts its expiry timer.
        """
        key = user, passwd, host, port
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Close expired connections and keep the cache below its limit."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            # iterate over a copy: entries are deleted inside the loop
            for k, v in list(self.timeout.items()):
                if v < t:
                    self._discard(k)
        self._update_soonest()

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    self._discard(k)
                    break
            self._update_soonest()

    def _discard(self, key):
        # Close the connection stored under key and forget it.
        self.cache[key].close()
        del self.cache[key]
        del self.timeout[key]

    def _update_soonest(self):
        # Recompute the earliest expiry time; min() cannot be applied
        # to an empty table.
        if self.timeout:
            self.soonest = min(self.timeout.values())
        else:
            self.soonest = 0
941
class GopherHandler(BaseHandler):
    """Open gopher URLs with gopherlib."""

    def gopher_open(self, req):
        """Send the selector (and query, if any) and return the reply.

        The reply is returned as an addinfourl object without headers.
        Raises GopherError when the request has no host.
        """
        host = req.get_host()
        if not host:
            raise GopherError('no host given')
        host = unquote(host)
        gopher_type, rest = splitgophertype(req.get_selector())
        selector, query = splitquery(rest)
        selector = unquote(selector)
        if not query:
            fp = gopherlib.send_selector(selector, host)
        else:
            fp = gopherlib.send_query(selector, unquote(query), host)
        return addinfourl(fp, noheaders(), req.get_full_url())
958
959#bleck! don't use this yet
class OpenerFactory:
    """Unfinished factory for OpenerDirector objects -- don't use this yet.

    The class attributes list the handlers to install; the add_*
    methods give the instance its own extended copy of a list.
    """

    default_handlers = [UnknownHandler, HTTPHandler,
                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
                        FTPHandler, FileHandler]
    proxy_handlers = [ProxyHandler]
    handlers = []
    replacement_handlers = []

    def add_proxy_handler(self, ph):
        """Add the proxy handler (class or instance) ph."""
        self.proxy_handlers = self.proxy_handlers + [ph]

    def add_handler(self, h):
        """Add the handler h."""
        self.handlers = self.handlers + [h]

    def replace_handler(self, h):
        # not implemented yet
        pass

    def build_opener(self):
        """Return a new OpenerDirector with the proxy handlers installed.

        Classes in proxy_handlers are instantiated with no arguments.
        """
        # TODO: default_handlers, handlers and replacement_handlers are
        # not installed yet.
        opener = OpenerDirector()
        for ph in self.proxy_handlers:
            if type(ph) == types.ClassType:
                ph = ph()
            opener.add_handler(ph)
        return opener
984
if __name__ == "__main__":
    # Manual test driver: opens a list of URLs and reports how many
    # bytes were read from each, or the error that occurred.
    #
    # XXX some of the test code depends on machine configurations that
    # are internal to CNRI.  Need to set up a public server with the
    # right authentication configuration for test purposes.
    hostname = socket.gethostname()
    if hostname == 'bitdiddle':
        test_host = 'bitdiddle.cnri.reston.va.us'
    elif hostname == 'walden':
        test_host = 'localhost'
    else:
        test_host = None
    urls = [
        # Thanks to Fred for finding these!
        'gopher://gopher.lib.ncsu.edu/11/library/stacks/Alex',
        'gopher://gopher.vt.edu:10010/10/33',

        'file:/etc/passwd',
        'file://nonsensename/etc/passwd',
        'ftp://www.python.org/pub/tmp/httplib.py',
        'ftp://www.python.org/pub/tmp/imageop.c',
        'ftp://www.python.org/pub/tmp/blat',
        'http://www.espn.com/', # redirect
        'http://www.python.org/Spanish/Inquistion/',
        ('http://grail.cnri.reston.va.us/cgi-bin/faqw.py',
         'query=pythonistas&querytype=simple&casefold=yes&req=search'),
        'http://www.python.org/',
        'ftp://prep.ai.mit.edu/welcome.msg',
        'ftp://www.python.org/pub/tmp/figure.prn',
        'ftp://www.python.org/pub/tmp/interp.pl',
        'http://checkproxy.cnri.reston.va.us/test/test.html',
        ]

    # The authentication tests need one of the known test hosts; on
    # any other machine no auth handlers are installed at all.
    auth_handlers = []
    if test_host is not None:
        urls = urls + [
            'file://%s/etc/passwd' % test_host,
            'http://%s/simple/' % test_host,
            'http://%s/digest/' % test_host,
            'http://%s/not/found.h' % test_host,
            ]

        bauth = HTTPBasicAuthHandler()
        bauth.add_password('basic_test_realm', test_host, 'jhylton',
                           'password')
        dauth = HTTPDigestAuthHandler()
        dauth.add_password('digest_test_realm', test_host, 'jhylton',
                           'password')
        auth_handlers = [dauth, bauth]

    cfh = CacheFTPHandler()
    cfh.setTimeout(1)

    # XXX try out some custom proxy objects too!
    def at_cnri(req):
        host = req.get_host()
        print(host)
        if host[-18:] == '.cnri.reston.va.us':
            return 1
    p = CustomProxy('http', at_cnri, 'proxy.cnri.reston.va.us')
    ph = CustomProxyHandler(p)

    install_opener(build_opener(*(auth_handlers +
                                  [cfh, GopherHandler, ph])))

    for url in urls:
        # a tuple entry is (url, data to post)
        if isinstance(url, tuple):
            url, req = url
        else:
            req = None
        print(url)
        try:
            f = urlopen(url, req)
        except IOError:
            print("IOError: %s" % (sys.exc_info()[1],))
        except socket.error:
            print("socket.error: %s" % (sys.exc_info()[1],))
        else:
            buf = f.read()
            f.close()
            print("read %d bytes" % len(buf))
        print("")
        time.sleep(0.1)
1063 time.sleep(0.1)