blob: 61e9caa5201e4176c9f005d913ff4e5e936fdac8 [file] [log] [blame]
Joe Gregorio845a5452010-09-08 13:50:34 -04001from __future__ import generators
2"""
3httplib2
4
5A caching http interface that supports ETags and gzip
6to conserve bandwidth.
7
8Requires Python 2.3 or later
9
10Changelog:
112007-08-18, Rick: Modified so it's able to use a socks proxy if needed.
12
13"""
14
15__author__ = "Joe Gregorio (joe@bitworking.org)"
16__copyright__ = "Copyright 2006, Joe Gregorio"
17__contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)",
18 "James Antill",
19 "Xavier Verges Farrero",
20 "Jonathan Feinberg",
21 "Blair Zajac",
22 "Sam Ruby",
23 "Louis Nyffenegger"]
24__license__ = "MIT"
25__version__ = "$Rev$"
26
27import re
28import sys
29import email
30import email.Utils
31import email.Message
32import email.FeedParser
33import StringIO
34import gzip
35import zlib
36import httplib
37import urlparse
38import base64
39import os
40import copy
41import calendar
42import time
43import random
44import errno
# remove deprecated warning in python2.6
46try:
47 from hashlib import sha1 as _sha, md5 as _md5
48except ImportError:
49 import sha
50 import md5
51 _sha = sha.new
52 _md5 = md5.new
53import hmac
54from gettext import gettext as _
55import socket
56
57try:
58 from httplib2 import socks
59except ImportError:
60 socks = None
61
# Build the appropriate socket wrapper for ssl.
# Prefer the stdlib `ssl` module (Python 2.6+); otherwise fall back to the
# older socket.ssl()/httplib.FakeSocket pair with the same call signature.
try:
    import ssl # python 2.6
    _ssl_wrap_socket = ssl.wrap_socket
except ImportError:
    def _ssl_wrap_socket(sock, key_file, cert_file):
        # Pre-2.6 fallback: socket.ssl returns a raw SSL object which
        # FakeSocket adapts to the file-like interface httplib expects.
        ssl_sock = socket.ssl(sock, key_file, cert_file)
        return httplib.FakeSocket(sock, ssl_sock)
71
# On Python 2.3+ use the real IRI -> URI conversion from the companion
# iri2uri module; on older versions fall back to an identity function
# (callers must then pass plain ASCII URIs themselves).
if sys.version_info >= (2,3):
    from iri2uri import iri2uri
else:
    def iri2uri(uri):
        # No-op fallback for pre-2.3 interpreters.
        return uri
77
def has_timeout(timeout): # python 2.6
    """Return True when *timeout* is an explicit, usable timeout value.

    None never counts as a timeout, and neither does the
    socket._GLOBAL_DEFAULT_TIMEOUT sentinel (present on Python 2.6+),
    which means "use whatever the global default is".
    """
    if timeout is None:
        return False
    # On interpreters without the sentinel, getattr yields None, which
    # *timeout* (now known non-None) can never be identical to.
    sentinel = getattr(socket, '_GLOBAL_DEFAULT_TIMEOUT', None)
    return timeout is not sentinel
82
# Public API of the module, as seen by "from httplib2 import *".
__all__ = ['Http', 'Response', 'ProxyInfo', 'HttpLib2Error',
  'RedirectMissingLocation', 'RedirectLimit', 'FailedToDecompressContent',
  'UnimplementedDigestAuthOptionError', 'UnimplementedHmacDigestAuthOptionError',
  'debuglevel']


# The httplib debug level, set to a non-zero value to get debug output
debuglevel = 0
91
92
# Python 2.3 support
if sys.version_info < (2,4):
    # NOTE: this shadows the sorted() builtin (introduced in 2.4).  Unlike
    # the builtin it sorts *in place* and returns the same list object, and
    # it accepts no key/cmp/reverse arguments -- callers in this module may
    # only rely on that common subset.
    def sorted(seq):
        seq.sort()
        return seq
98
# Python 2.3 support
def HTTPResponse__getheaders(self):
    """Return list of (header, value) tuples."""
    # self.msg is the mimetools/email message holding the parsed headers;
    # it is None until the response has actually been read.
    if self.msg is None:
        raise httplib.ResponseNotReady()
    return self.msg.items()

# Monkey-patch httplib.HTTPResponse on interpreters too old to provide
# getheaders() themselves.
if not hasattr(httplib.HTTPResponse, 'getheaders'):
    httplib.HTTPResponse.getheaders = HTTPResponse__getheaders
108
# All exceptions raised here derive from HttpLib2Error
class HttpLib2Error(Exception): pass

# Some exceptions can be caught and optionally
# be turned back into responses.
class HttpLib2ErrorWithResponse(HttpLib2Error):
    def __init__(self, desc, response, content):
        # Keep the response and body around so a caller (or the
        # force_exception_to_status_code machinery) can fall back to them.
        self.response = response
        self.content = content
        HttpLib2Error.__init__(self, desc)

# A 3xx response arrived without the Location header needed to follow it
# (raised by the redirect handling later in this module).
class RedirectMissingLocation(HttpLib2ErrorWithResponse): pass
# More consecutive redirects were seen than the caller allowed.
class RedirectLimit(HttpLib2ErrorWithResponse): pass
# The body advertised a content-encoding it could not be decoded with.
class FailedToDecompressContent(HttpLib2ErrorWithResponse): pass
# A Digest challenge used a qop/algorithm this client does not implement.
class UnimplementedDigestAuthOptionError(HttpLib2ErrorWithResponse): pass
# An HMACDigest challenge used an option this client does not implement.
class UnimplementedHmacDigestAuthOptionError(HttpLib2ErrorWithResponse): pass

# A relative URI was passed where an absolute one is required (see urlnorm).
class RelativeURIError(HttpLib2Error): pass
# DNS resolution of the target host failed (see _conn_request).
class ServerNotFoundError(HttpLib2Error): pass
128
129# Open Items:
130# -----------
131# Proxy support
132
133# Are we removing the cached content too soon on PUT (only delete on 200 Maybe?)
134
135# Pluggable cache storage (supports storing the cache in
136# flat files by default. We need a plug-in architecture
137# that can support Berkeley DB and Squid)
138
139# == Known Issues ==
140# Does not handle a resource that uses conneg and Last-Modified but no ETag as a cache validator.
141# Does not handle Cache-Control: max-stale
142# Does not use Age: headers when calculating cache freshness.
143
144
# The number of redirections to follow before giving up.
# Note that only GET redirects are automatically followed.
# Will also honor 301 requests by saving that info and never
# requesting that URI again.
# (Used as the default for the `redirections` argument threaded through
# Http._request; callers can override it per request.)
DEFAULT_MAX_REDIRECTS = 5
150
151# Which headers are hop-by-hop headers by default
152HOP_BY_HOP = ['connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization', 'te', 'trailers', 'transfer-encoding', 'upgrade']
153
154def _get_end2end_headers(response):
155 hopbyhop = list(HOP_BY_HOP)
156 hopbyhop.extend([x.strip() for x in response.get('connection', '').split(',')])
157 return [header for header in response.keys() if header not in hopbyhop]
158
URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")

def parse_uri(uri):
    """Parses a URI using the regex given in Appendix B of RFC 3986.

    (scheme, authority, path, query, fragment) = parse_uri(uri)

    Components absent from *uri* come back as None.
    """
    parts = URI.match(uri).groups()
    return (parts[1], parts[3], parts[4], parts[6], parts[8])
168
def urlnorm(uri):
    """Normalize an absolute URI for use as a cache key.

    Returns (scheme, authority, request_uri, defrag_uri): scheme and
    authority lower-cased, an empty path replaced by "/", and defrag_uri
    the URI with its fragment dropped.

    Raises RelativeURIError if *uri* lacks a scheme or authority.
    """
    (scheme, authority, path, query, fragment) = parse_uri(uri)
    if not scheme or not authority:
        raise RelativeURIError("Only absolute URIs are allowed. uri = %s" % uri)
    # Scheme and host are case-insensitive (RFC 3986); lower them once.
    # (The original code redundantly lowered `scheme` a second time below.)
    scheme = scheme.lower()
    authority = authority.lower()
    if not path:
        path = "/"
    # Could do syntax based normalization of the URI before
    # computing the digest. See Section 6.2.2 of Std 66.
    request_uri = query and "?".join([path, query]) or path
    defrag_uri = scheme + "://" + authority + request_uri
    return scheme, authority, request_uri, defrag_uri
183
184
# Cache filename construction (original borrowed from Venus http://intertwingly.net/code/venus/)
re_url_scheme = re.compile(r'^\w+://')
re_slash = re.compile(r'[?/:|]+')

def safename(filename):
    """Return a filename suitable for the cache.

    Strips dangerous and common characters to create a filename we
    can use to store the cache in.
    """

    try:
        # For absolute URLs, try IDNA-encoding so non-ASCII names become
        # ASCII.  NOTE(review): .encode('idna') is applied to the whole URL,
        # which only succeeds for host-like strings -- anything else falls
        # through via the UnicodeError below.  (Python 2 str/unicode split.)
        if re_url_scheme.match(filename):
            if isinstance(filename,str):
                filename = filename.decode('utf-8')
                filename = filename.encode('idna')
            else:
                filename = filename.encode('idna')
    except UnicodeError:
        pass
    if isinstance(filename,unicode):
        filename=filename.encode('utf-8')
    # Hash the *original* (pre-stripping) name; it is appended below so
    # truncation or character-stripping cannot make two keys collide.
    filemd5 = _md5(filename).hexdigest()
    filename = re_url_scheme.sub("", filename)
    filename = re_slash.sub(",", filename)

    # limit length of filename
    if len(filename)>200:
        filename=filename[:200]
    return ",".join((filename, filemd5))
215
216NORMALIZE_SPACE = re.compile(r'(?:\r\n)?[ \t]+')
217def _normalize_headers(headers):
218 return dict([ (key.lower(), NORMALIZE_SPACE.sub(value, ' ').strip()) for (key, value) in headers.iteritems()])
219
220def _parse_cache_control(headers):
221 retval = {}
222 if headers.has_key('cache-control'):
223 parts = headers['cache-control'].split(',')
224 parts_with_args = [tuple([x.strip().lower() for x in part.split("=", 1)]) for part in parts if -1 != part.find("=")]
225 parts_wo_args = [(name.strip().lower(), 1) for name in parts if -1 == name.find("=")]
226 retval = dict(parts_with_args + parts_wo_args)
227 return retval
228
229# Whether to use a strict mode to parse WWW-Authenticate headers
230# Might lead to bad results in case of ill-formed header value,
231# so disabled by default, falling back to relaxed parsing.
232# Set to true to turn on, usefull for testing servers.
233USE_WWW_AUTH_STRICT_PARSING = 0
234
235# In regex below:
236# [^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+ matches a "token" as defined by HTTP
237# "(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?" matches a "quoted-string" as defined by HTTP, when LWS have already been replaced by a single space
238# Actually, as an auth-param value can be either a token or a quoted-string, they are combined in a single pattern which matches both:
239# \"?((?<=\")(?:[^\0-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x08\x0A-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?
240WWW_AUTH_STRICT = re.compile(r"^(?:\s*(?:,\s*)?([^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+)\s*=\s*\"?((?<=\")(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?)(.*)$")
241WWW_AUTH_RELAXED = re.compile(r"^(?:\s*(?:,\s*)?([^ \t\r\n=]+)\s*=\s*\"?((?<=\")(?:[^\\\"]|\\.)*?(?=\")|(?<!\")[^ \t\r\n,]+(?!\"))\"?)(.*)$")
242UNQUOTE_PAIRS = re.compile(r'\\(.)')
243def _parse_www_authenticate(headers, headername='www-authenticate'):
244 """Returns a dictionary of dictionaries, one dict
245 per auth_scheme."""
246 retval = {}
247 if headers.has_key(headername):
248 authenticate = headers[headername].strip()
249 www_auth = USE_WWW_AUTH_STRICT_PARSING and WWW_AUTH_STRICT or WWW_AUTH_RELAXED
250 while authenticate:
251 # Break off the scheme at the beginning of the line
252 if headername == 'authentication-info':
253 (auth_scheme, the_rest) = ('digest', authenticate)
254 else:
255 (auth_scheme, the_rest) = authenticate.split(" ", 1)
256 # Now loop over all the key value pairs that come after the scheme,
257 # being careful not to roll into the next scheme
258 match = www_auth.search(the_rest)
259 auth_params = {}
260 while match:
261 if match and len(match.groups()) == 3:
262 (key, value, the_rest) = match.groups()
263 auth_params[key.lower()] = UNQUOTE_PAIRS.sub(r'\1', value) # '\\'.join([x.replace('\\', '') for x in value.split('\\\\')])
264 match = www_auth.search(the_rest)
265 retval[auth_scheme.lower()] = auth_params
266 authenticate = the_rest.strip()
267 return retval
268
269
def _entry_disposition(response_headers, request_headers):
    """Determine freshness from the Date, Expires and Cache-Control headers.

    Returns one of "FRESH" (serve from cache), "STALE" (revalidate) or
    "TRANSPARENT" (bypass the cache entirely).

    We don't handle the following:

    1. Cache-Control: max-stale
    2. Age: headers are not used in the calculations.

    Not that this algorithm is simpler than you might think
    because we are operating as a private (non-shared) cache.
    This lets us ignore 's-maxage'. We can also ignore
    'proxy-invalidate' since we aren't a proxy.
    We will never return a stale document as
    fresh as a design decision, and thus the non-implementation
    of 'max-stale'. This also lets us safely ignore 'must-revalidate'
    since we operate as if every server has sent 'must-revalidate'.
    Since we are private we get to ignore both 'public' and
    'private' parameters. We also ignore 'no-transform' since
    we don't do any transformations.
    The 'no-store' parameter is handled at a higher level.
    So the only Cache-Control parameters we look at are:

    no-cache
    only-if-cached
    max-age
    min-fresh
    """
    # Only mechanical modernization below: dict.has_key -> 'in' (has_key is
    # gone in Python 3 and the function already used 'in' in one place),
    # and 'None == expires' -> 'expires is None'.  Logic is unchanged.

    retval = "STALE"
    cc = _parse_cache_control(request_headers)
    cc_response = _parse_cache_control(response_headers)

    if 'pragma' in request_headers and request_headers['pragma'].lower().find('no-cache') != -1:
        retval = "TRANSPARENT"
        if 'cache-control' not in request_headers:
            request_headers['cache-control'] = 'no-cache'
    elif 'no-cache' in cc:
        retval = "TRANSPARENT"
    elif 'no-cache' in cc_response:
        retval = "STALE"
    elif 'only-if-cached' in cc:
        retval = "FRESH"
    elif 'date' in response_headers:
        date = calendar.timegm(email.Utils.parsedate_tz(response_headers['date']))
        now = time.time()
        current_age = max(0, now - date)
        if 'max-age' in cc_response:
            try:
                freshness_lifetime = int(cc_response['max-age'])
            except ValueError:
                freshness_lifetime = 0
        elif 'expires' in response_headers:
            expires = email.Utils.parsedate_tz(response_headers['expires'])
            if expires is None:
                # Unparseable Expires: treat the entry as already expired.
                freshness_lifetime = 0
            else:
                freshness_lifetime = max(0, calendar.timegm(expires) - date)
        else:
            freshness_lifetime = 0
        # The request's own max-age overrides the response's.
        if 'max-age' in cc:
            try:
                freshness_lifetime = int(cc['max-age'])
            except ValueError:
                freshness_lifetime = 0
        if 'min-fresh' in cc:
            try:
                min_fresh = int(cc['min-fresh'])
            except ValueError:
                min_fresh = 0
            current_age += min_fresh
        if freshness_lifetime > current_age:
            retval = "FRESH"
    return retval
343
344def _decompressContent(response, new_content):
345 content = new_content
346 try:
347 encoding = response.get('content-encoding', None)
348 if encoding in ['gzip', 'deflate']:
349 if encoding == 'gzip':
350 content = gzip.GzipFile(fileobj=StringIO.StringIO(new_content)).read()
351 if encoding == 'deflate':
352 content = zlib.decompress(content)
353 response['content-length'] = str(len(content))
354 # Record the historical presence of the encoding in a way the won't interfere.
355 response['-content-encoding'] = response['content-encoding']
356 del response['content-encoding']
357 except IOError:
358 content = ""
359 raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding'), response, content)
360 return content
361
def _updateCache(request_headers, response_headers, content, cache, cachekey):
    """Store (or purge) a response under *cachekey*.

    The entry is a 'status:' line, the CRLF-normalized end-to-end headers
    (with '-varied-*' annotations for Vary), then the body.  If either
    side sent Cache-Control: no-store the entry is deleted instead.
    """
    if cachekey:
        cc = _parse_cache_control(request_headers)
        cc_response = _parse_cache_control(response_headers)
        # 'in' instead of the py2-only dict.has_key.
        if 'no-store' in cc or 'no-store' in cc_response:
            cache.delete(cachekey)
        else:
            info = email.Message.Message()
            for key, value in response_headers.iteritems():
                if key not in ['status','content-encoding','transfer-encoding']:
                    info[key] = value

            # Add annotations to the cache to indicate what headers
            # are variant for this request.
            vary = response_headers.get('vary', None)
            if vary:
                vary_headers = vary.lower().replace(' ', '').split(',')
                for header in vary_headers:
                    key = '-varied-%s' % header
                    try:
                        info[key] = request_headers[header]
                    except KeyError:
                        pass

            # A 304 means the cached body is still valid, so store the
            # entry as a plain 200.
            status = response_headers.status
            if status == 304:
                status = 200

            # Bug fix: this previously interpolated response_headers.status,
            # which ignored the 304 -> 200 normalization computed just above
            # and wrote entries with 'status: 304'.
            status_header = 'status: %d\r\n' % status

            header_str = info.as_string()

            header_str = re.sub("\r(?!\n)|(?<!\r)\n", "\r\n", header_str)
            text = "".join([status_header, header_str, content])

            cache.set(cachekey, text)
398
def _cnonce():
    """Return a fresh 16-hex-digit client nonce for Digest/HMACDigest auth.

    Hashes the current ctime plus 20 pseudo-random digits, so the value is
    non-deterministic by design.
    """
    random_digits = ["0123456789"[random.randrange(0, 9)] for i in range(20)]
    seed = "%s:%s" % (time.ctime(), random_digits)
    return _md5(seed).hexdigest()[:16]
402
def _wsse_username_token(cnonce, iso_now, password):
    """Compute the WSSE PasswordDigest: Base64(SHA1(nonce + created + password))."""
    digest = _sha("%s%s%s" % (cnonce, iso_now, password)).digest()
    return base64.b64encode(digest).strip()
405
406
407# For credentials we need two things, first
# a pool of credentials to try (not necessarily tied to Basic, Digest, etc.)
409# Then we also need a list of URIs that have already demanded authentication
410# That list is tricky since sub-URIs can take the same auth, or the
411# auth scheme may change as you descend the tree.
412# So we also need each Auth instance to be able to tell us
413# how close to the 'top' it is.
414
class Authentication(object):
    """Base class for the per-scheme authorization handlers.

    An instance is built from a single challenge response and later asked,
    via request(), to decorate outgoing requests that fall inside its
    scope (same host, URI under self.path) with an Authorization header.
    """
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        (scheme, authority, path, query, fragment) = parse_uri(request_uri)
        self.path = path
        self.host = host
        self.credentials = credentials
        self.http = http

    def depth(self, request_uri):
        # Count the "/" separators in request_uri past this handler's root
        # path; used to judge how close to the 'top' this handler sits.
        (scheme, authority, path, query, fragment) = parse_uri(request_uri)
        return request_uri[len(self.path):].count("/")

    def inscope(self, host, request_uri):
        # XXX Should we normalize the request_uri?
        (scheme, authority, path, query, fragment) = parse_uri(request_uri)
        return (host == self.host) and path.startswith(self.path)

    def request(self, method, request_uri, headers, content):
        """Modify the request headers to add the appropriate
        Authorization header. Override this in sub-classes."""
        pass

    def response(self, response, content):
        """Gives us a chance to update with new nonces
        or such returned from the last authorized response.
        Override this in sub-classes if necessary.

        Return TRUE if the request is to be retried, for
        example Digest may return stale=true.
        """
        return False
446
447
448
class BasicAuthentication(Authentication):
    """Implements RFC 2617 Basic authentication: the credentials travel
    base64-encoded on every request."""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)

    def request(self, method, request_uri, headers, content):
        """Modify the request headers to add the appropriate
        Authorization header."""
        userpass = "%s:%s" % self.credentials
        headers['authorization'] = 'Basic ' + base64.b64encode(userpass).strip()
457
458
class DigestAuthentication(Authentication):
    """Only do qop='auth' and MD5, since that
    is all Apache currently implements"""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
        challenge = _parse_www_authenticate(response, 'www-authenticate')
        self.challenge = challenge['digest']
        # Accept the challenge only if 'auth' is among the offered qop
        # values; anything else is unimplemented.
        qop = self.challenge.get('qop', 'auth')
        self.challenge['qop'] = ('auth' in [x.strip() for x in qop.split()]) and 'auth' or None
        if self.challenge['qop'] is None:
            raise UnimplementedDigestAuthOptionError( _("Unsupported value for qop: %s." % qop))
        self.challenge['algorithm'] = self.challenge.get('algorithm', 'MD5').upper()
        if self.challenge['algorithm'] != 'MD5':
            raise UnimplementedDigestAuthOptionError( _("Unsupported value for algorithm: %s." % self.challenge['algorithm']))
        # A1 = user:realm:password, per RFC 2617 section 3.2.2.2.
        self.A1 = "".join([self.credentials[0], ":", self.challenge['realm'], ":", self.credentials[1]])
        # nc = nonce count, incremented on every request we sign.
        self.challenge['nc'] = 1

    def request(self, method, request_uri, headers, content, cnonce = None):
        """Modify the request headers by adding a Digest Authorization
        header computed per RFC 2617 (qop=auth, MD5 only).  *cnonce* may
        be supplied for testing; otherwise a random one is generated."""
        H = lambda x: _md5(x).hexdigest()
        KD = lambda s, d: H("%s:%s" % (s, d))
        A2 = "".join([method, ":", request_uri])
        self.challenge['cnonce'] = cnonce or _cnonce()
        request_digest = '"%s"' % KD(H(self.A1), "%s:%s:%s:%s:%s" % (self.challenge['nonce'],
                    '%08x' % self.challenge['nc'],
                    self.challenge['cnonce'],
                    self.challenge['qop'], H(A2)
                    ))
        headers['Authorization'] = 'Digest username="%s", realm="%s", nonce="%s", uri="%s", algorithm=%s, response=%s, qop=%s, nc=%08x, cnonce="%s"' % (
                self.credentials[0],
                self.challenge['realm'],
                self.challenge['nonce'],
                request_uri,
                self.challenge['algorithm'],
                request_digest,
                self.challenge['qop'],
                self.challenge['nc'],
                self.challenge['cnonce'],
                )
        # Next request under the same nonce must carry the next count.
        self.challenge['nc'] += 1

    def response(self, response, content):
        """Pick up new nonces from the server.  Returns True (retry the
        request) when the server flagged our nonce as stale."""
        if not response.has_key('authentication-info'):
            challenge = _parse_www_authenticate(response, 'www-authenticate').get('digest', {})
            if 'true' == challenge.get('stale'):
                # Server rejected our nonce but the credentials are fine:
                # restart the nonce count and retry.
                self.challenge['nonce'] = challenge['nonce']
                self.challenge['nc'] = 1
                return True
        else:
            updated_challenge = _parse_www_authenticate(response, 'authentication-info').get('digest', {})

            if updated_challenge.has_key('nextnonce'):
                self.challenge['nonce'] = updated_challenge['nextnonce']
                self.challenge['nc'] = 1
        return False
514
515
class HmacDigestAuthentication(Authentication):
    """Adapted from Robert Sayre's code and DigestAuthentication above."""
    __author__ = "Thomas Broyer (t.broyer@ltgt.net)"

    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
        challenge = _parse_www_authenticate(response, 'www-authenticate')
        self.challenge = challenge['hmacdigest']
        # TODO: self.challenge['domain']
        self.challenge['reason'] = self.challenge.get('reason', 'unauthorized')
        if self.challenge['reason'] not in ['unauthorized', 'integrity']:
            self.challenge['reason'] = 'unauthorized'
        self.challenge['salt'] = self.challenge.get('salt', '')
        if not self.challenge.get('snonce'):
            raise UnimplementedHmacDigestAuthOptionError( _("The challenge doesn't contain a server nonce, or this one is empty."))
        self.challenge['algorithm'] = self.challenge.get('algorithm', 'HMAC-SHA-1')
        if self.challenge['algorithm'] not in ['HMAC-SHA-1', 'HMAC-MD5']:
            raise UnimplementedHmacDigestAuthOptionError( _("Unsupported value for algorithm: %s." % self.challenge['algorithm']))
        self.challenge['pw-algorithm'] = self.challenge.get('pw-algorithm', 'SHA-1')
        if self.challenge['pw-algorithm'] not in ['SHA-1', 'MD5']:
            raise UnimplementedHmacDigestAuthOptionError( _("Unsupported value for pw-algorithm: %s." % self.challenge['pw-algorithm']))
        if self.challenge['algorithm'] == 'HMAC-MD5':
            self.hashmod = _md5
        else:
            self.hashmod = _sha
        if self.challenge['pw-algorithm'] == 'MD5':
            self.pwhashmod = _md5
        else:
            self.pwhashmod = _sha
        # Bug fix: _md5/_sha are hash *constructors* (hashlib.md5/sha1, or
        # the legacy md5.new/sha.new functions -- see the import fallback at
        # the top of this module), so they must be called directly.  The old
        # code called self.pwhashmod.new(...), which raises AttributeError
        # on both import paths since neither a hashlib constructor nor the
        # legacy .new function has a .new attribute.
        self.key = "".join([self.credentials[0], ":",
                    self.pwhashmod("".join([self.credentials[1], self.challenge['salt']])).hexdigest().lower(),
                    ":", self.challenge['realm']
                    ])
        self.key = self.pwhashmod(self.key).hexdigest().lower()

    def request(self, method, request_uri, headers, content):
        """Modify the request headers by adding an HMACDigest
        Authorization header covering the end-to-end request headers."""
        keys = _get_end2end_headers(headers)
        keylist = "".join(["%s " % k for k in keys])
        headers_val = "".join([headers[k] for k in keys])
        created = time.strftime('%Y-%m-%dT%H:%M:%SZ',time.gmtime())
        cnonce = _cnonce()
        request_digest = "%s:%s:%s:%s:%s" % (method, request_uri, cnonce, self.challenge['snonce'], headers_val)
        request_digest = hmac.new(self.key, request_digest, self.hashmod).hexdigest().lower()
        headers['Authorization'] = 'HMACDigest username="%s", realm="%s", snonce="%s", cnonce="%s", uri="%s", created="%s", response="%s", headers="%s"' % (
                self.credentials[0],
                self.challenge['realm'],
                self.challenge['snonce'],
                cnonce,
                request_uri,
                created,
                request_digest,
                keylist,
                )

    def response(self, response, content):
        """Retry when the server reports an integrity or stale-nonce failure."""
        challenge = _parse_www_authenticate(response, 'www-authenticate').get('hmacdigest', {})
        if challenge.get('reason') in ['integrity', 'stale']:
            return True
        return False
576
577
class WsseAuthentication(Authentication):
    """This is thinly tested and should not be relied upon.
    At this time there isn't any third party server to test against.
    Blogger and TypePad implemented this algorithm at one point
    but Blogger has since switched to Basic over HTTPS and
    TypePad has implemented it wrong, by never issuing a 401
    challenge but instead requiring your client to telepathically know that
    their endpoint is expecting WSSE profile="UsernameToken"."""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)

    def request(self, method, request_uri, headers, content):
        """Add the WSSE profile Authorization header and the X-WSSE
        UsernameToken header to the outgoing request."""
        headers['Authorization'] = 'WSSE profile="UsernameToken"'
        created = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        nonce = _cnonce()
        digest = _wsse_username_token(nonce, created, self.credentials[1])
        token = 'UsernameToken Username="%s", PasswordDigest="%s", Nonce="%s", Created="%s"' % (
            self.credentials[0], digest, nonce, created)
        headers['X-WSSE'] = token
601
class GoogleLoginAuthentication(Authentication):
    """Handles the GoogleLogin auth scheme.

    Construction performs a network request: the user's credentials are
    traded for an Auth token via the ClientLogin endpoint, and request()
    then attaches that token to outgoing requests.
    """
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        from urllib import urlencode
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
        challenge = _parse_www_authenticate(response, 'www-authenticate')
        service = challenge['googlelogin'].get('service', 'xapi')
        # Blogger actually returns the service in the challenge
        # For the rest we guess based on the URI
        if service == 'xapi' and request_uri.find("calendar") > 0:
            service = "cl"
        # No point in guessing Base or Spreadsheet
        #elif request_uri.find("spreadsheets") > 0:
        #    service = "wise"

        auth = dict(Email=credentials[0], Passwd=credentials[1], service=service, source=headers['user-agent'])
        resp, content = self.http.request("https://www.google.com/accounts/ClientLogin", method="POST", body=urlencode(auth), headers={'Content-Type': 'application/x-www-form-urlencoded'})
        # Response body is newline-separated key=value pairs.
        lines = content.split('\n')
        d = dict([tuple(line.split("=", 1)) for line in lines if line])
        if resp.status == 403:
            # Login rejected; subsequent requests go out with an empty token.
            self.Auth = ""
        else:
            self.Auth = d['Auth']

    def request(self, method, request_uri, headers, content):
        """Modify the request headers to add the appropriate
        Authorization header."""
        headers['authorization'] = 'GoogleLogin Auth=' + self.Auth
629
630
# Maps the lower-cased auth scheme name (as parsed out of the
# WWW-Authenticate header) to the handler class implementing it.
AUTH_SCHEME_CLASSES = {
    "basic": BasicAuthentication,
    "wsse": WsseAuthentication,
    "digest": DigestAuthentication,
    "hmacdigest": HmacDigestAuthentication,
    "googlelogin": GoogleLoginAuthentication
}

# Order in which offered challenges are tried (see _auth_from_challenge);
# schemes earlier in the list are attempted first.
AUTH_SCHEME_ORDER = ["hmacdigest", "googlelogin", "digest", "wsse", "basic"]
640
class FileCache(object):
    """Uses a local directory as a store for cached files.
    Not really safe to use if multiple threads or processes are going to
    be running on the same cache.
    """
    def __init__(self, cache, safe=safename): # use safe=lambda x: md5.new(x).hexdigest() for the old behavior
        # cache -- directory in which entries are stored (created if missing).
        # safe  -- maps a cache key (URI) to a filesystem-safe filename.
        self.cache = cache
        self.safe = safe
        if not os.path.exists(cache):
            os.makedirs(self.cache)

    def get(self, key):
        """Return the cached value for *key*, or None if absent/unreadable."""
        retval = None
        cacheFullPath = os.path.join(self.cache, self.safe(key))
        try:
            # open() instead of the Python-2-only file() builtin; close the
            # handle even if read() raises (the old code leaked it).
            f = open(cacheFullPath, "rb")
            try:
                retval = f.read()
            finally:
                f.close()
        except IOError:
            pass
        return retval

    def set(self, key, value):
        """Store *value* under *key*, overwriting any existing entry."""
        cacheFullPath = os.path.join(self.cache, self.safe(key))
        f = open(cacheFullPath, "wb")
        try:
            f.write(value)
        finally:
            f.close()

    def delete(self, key):
        """Remove the entry for *key*, if present."""
        cacheFullPath = os.path.join(self.cache, self.safe(key))
        if os.path.exists(cacheFullPath):
            os.remove(cacheFullPath)
673
class Credentials(object):
    """A registry of (domain, name, password) triples used to answer
    authentication challenges."""
    def __init__(self):
        self.credentials = []

    def add(self, name, password, domain=""):
        """Register *name*/*password* for *domain* ("" = match any domain)."""
        self.credentials.append((domain.lower(), name, password))

    def clear(self):
        """Forget every stored credential."""
        self.credentials = []

    def iter(self, domain):
        """Yield each (name, password) pair registered for *domain* or
        registered domain-free."""
        for (stored_domain, name, password) in self.credentials:
            if stored_domain in ("", domain):
                yield (name, password)
688
# Used by Http.add_certificate(): iter() then yields (key, cert) pairs
# instead of (name, password).
class KeyCerts(Credentials):
    """Identical to Credentials except that
    name/password are mapped to key/cert."""
    pass
693
694
class ProxyInfo(object):
    """Collect information required to use a proxy."""
    def __init__(self, proxy_type, proxy_host, proxy_port, proxy_rdns=None, proxy_user=None, proxy_pass=None):
        """The parameter proxy_type must be set to one of socks.PROXY_TYPE_XXX
        constants. For example:

p = ProxyInfo(proxy_type=socks.PROXY_TYPE_HTTP, proxy_host='localhost', proxy_port=8000)
        """
        self.proxy_type = proxy_type
        self.proxy_host = proxy_host
        self.proxy_port = proxy_port
        self.proxy_rdns = proxy_rdns
        self.proxy_user = proxy_user
        self.proxy_pass = proxy_pass

    def astuple(self):
        """Return the settings as a tuple in socksocket.setproxy() order."""
        return (self.proxy_type, self.proxy_host, self.proxy_port,
                self.proxy_rdns, self.proxy_user, self.proxy_pass)

    def isgood(self):
        """True when both a host and a port are configured."""
        return (self.proxy_host is not None) and (self.proxy_port is not None)
711
712
class HTTPConnectionWithTimeout(httplib.HTTPConnection):
    """HTTPConnection subclass that supports timeouts and optional
    SOCKS/HTTP proxying (via the bundled socks module)."""

    def __init__(self, host, port=None, strict=None, timeout=None, proxy_info=None):
        httplib.HTTPConnection.__init__(self, host, port, strict)
        # timeout -- per-socket timeout in seconds (see has_timeout()).
        # proxy_info -- optional ProxyInfo; used only when isgood() is true.
        self.timeout = timeout
        self.proxy_info = proxy_info

    def connect(self):
        """Connect to the host and port specified in __init__."""
        # Mostly verbatim from httplib.py.
        # Try each address returned by getaddrinfo until one connects;
        # the last socket.error is re-raised if all of them fail.
        msg = "getaddrinfo returns an empty list"
        for res in socket.getaddrinfo(self.host, self.port, 0,
                socket.SOCK_STREAM):
            af, socktype, proto, canonname, sa = res
            try:
                if self.proxy_info and self.proxy_info.isgood():
                    # Proxied path: socksocket tunnels through the proxy.
                    self.sock = socks.socksocket(af, socktype, proto)
                    self.sock.setproxy(*self.proxy_info.astuple())
                else:
                    # Direct path; TCP_NODELAY is only set here, not on the
                    # proxied socket.
                    self.sock = socket.socket(af, socktype, proto)
                    self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
                # Different from httplib: support timeouts.
                if has_timeout(self.timeout):
                    self.sock.settimeout(self.timeout)
                # End of difference from httplib.
                if self.debuglevel > 0:
                    print "connect: (%s, %s)" % (self.host, self.port)

                self.sock.connect(sa)
            except socket.error, msg:
                if self.debuglevel > 0:
                    print 'connect fail:', (self.host, self.port)
                if self.sock:
                    self.sock.close()
                self.sock = None
                continue
            break
        if not self.sock:
            raise socket.error, msg
753
class HTTPSConnectionWithTimeout(httplib.HTTPSConnection):
    "This class allows communication via SSL."

    def __init__(self, host, port=None, key_file=None, cert_file=None,
                 strict=None, timeout=None, proxy_info=None):
        # key_file/cert_file are handed to the SSL wrapper at connect time.
        httplib.HTTPSConnection.__init__(self, host, port=port, key_file=key_file,
                cert_file=cert_file, strict=strict)
        self.timeout = timeout
        self.proxy_info = proxy_info

    def connect(self):
        "Connect to a host on a given (SSL) port."

        # Build a plain (or proxied) TCP socket first; it is wrapped with
        # SSL only after the connection is established.
        if self.proxy_info and self.proxy_info.isgood():
            sock = socks.socksocket(socket.AF_INET, socket.SOCK_STREAM)
            sock.setproxy(*self.proxy_info.astuple())
        else:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            # TCP_NODELAY is only applied on the direct (non-proxied) path.
            sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)

        if has_timeout(self.timeout):
            sock.settimeout(self.timeout)
        sock.connect((self.host, self.port))
        self.sock =_ssl_wrap_socket(sock, self.key_file, self.cert_file)
778
779
780
781class Http(object):
782 """An HTTP client that handles:
783- all methods
784- caching
785- ETags
786- compression,
787- HTTPS
788- Basic
789- Digest
790- WSSE
791
792and more.
793 """
    def __init__(self, cache=None, timeout=None, proxy_info=None):
        """The value of proxy_info is a ProxyInfo instance.

If 'cache' is a string then it is used as a directory name
for a disk cache. Otherwise it must be an object that supports
the same interface as FileCache.

'timeout' is a per-socket timeout in seconds, passed down to the
connection objects."""
        self.proxy_info = proxy_info
        # Map domain name to an httplib connection
        self.connections = {}
        # The location of the cache, for now a directory
        # where cached responses are held.
        if cache and isinstance(cache, str):
            self.cache = FileCache(cache)
        else:
            self.cache = cache

        # Name/password
        self.credentials = Credentials()

        # Key/cert
        self.certificates = KeyCerts()

        # authorization objects
        self.authorizations = []

        # If set to False then no redirects are followed, even safe ones.
        self.follow_redirects = True

        # Which HTTP methods do we apply optimistic concurrency to, i.e.
        # which methods get an "if-match:" etag header added to them.
        self.optimistic_concurrency_methods = ["PUT"]

        # If 'follow_redirects' is True, and this is set to True then
        # all redirects are followed, including unsafe ones.
        self.follow_all_redirects = False

        # If True, cached ETags are not sent for validation (used by the
        # request logic implemented later in this class).
        self.ignore_etag = False

        # If True, exceptions are presumably converted into synthetic
        # responses rather than raised (handled outside this chunk).
        self.force_exception_to_status_code = False

        self.timeout = timeout
835
836 def _auth_from_challenge(self, host, request_uri, headers, response, content):
837 """A generator that creates Authorization objects
838 that can be applied to requests.
839 """
840 challenges = _parse_www_authenticate(response, 'www-authenticate')
841 for cred in self.credentials.iter(host):
842 for scheme in AUTH_SCHEME_ORDER:
843 if challenges.has_key(scheme):
844 yield AUTH_SCHEME_CLASSES[scheme](cred, host, request_uri, headers, response, content, self)
845
846 def add_credentials(self, name, password, domain=""):
847 """Add a name and password that will be used
848 any time a request requires authentication."""
849 self.credentials.add(name, password, domain)
850
851 def add_certificate(self, key, cert, domain):
852 """Add a key and cert that will be used
853 any time a request requires authentication."""
854 self.certificates.add(key, cert, domain)
855
856 def clear_credentials(self):
857 """Remove all the names and passwords
858 that are used for authentication"""
859 self.credentials.clear()
860 self.authorizations = []
861
862 def _conn_request(self, conn, request_uri, method, body, headers):
863 for i in range(2):
864 try:
865 conn.request(method, request_uri, body, headers)
866 except socket.gaierror:
867 conn.close()
868 raise ServerNotFoundError("Unable to find the server at %s" % conn.host)
869 except socket.error, e:
870 if e.errno == errno.ECONNREFUSED: # Connection refused
871 raise
872 except httplib.HTTPException:
873 # Just because the server closed the connection doesn't apparently mean
874 # that the server didn't send a response.
875 pass
876 try:
877 response = conn.getresponse()
878 except (socket.error, httplib.HTTPException):
879 if i == 0:
880 conn.close()
881 conn.connect()
882 continue
883 else:
884 raise
885 else:
886 content = ""
887 if method == "HEAD":
888 response.close()
889 else:
890 content = response.read()
891 response = Response(response)
892 if method != "HEAD":
893 content = _decompressContent(response, content)
894 break
895 return (response, content)
896
897
898 def _request(self, conn, host, absolute_uri, request_uri, method, body, headers, redirections, cachekey):
899 """Do the actual request using the connection object
900 and also follow one level of redirects if necessary"""
901
902 auths = [(auth.depth(request_uri), auth) for auth in self.authorizations if auth.inscope(host, request_uri)]
903 auth = auths and sorted(auths)[0][1] or None
904 if auth:
905 auth.request(method, request_uri, headers, body)
906
907 (response, content) = self._conn_request(conn, request_uri, method, body, headers)
908
909 if auth:
910 if auth.response(response, body):
911 auth.request(method, request_uri, headers, body)
912 (response, content) = self._conn_request(conn, request_uri, method, body, headers )
913 response._stale_digest = 1
914
915 if response.status == 401:
916 for authorization in self._auth_from_challenge(host, request_uri, headers, response, content):
917 authorization.request(method, request_uri, headers, body)
918 (response, content) = self._conn_request(conn, request_uri, method, body, headers, )
919 if response.status != 401:
920 self.authorizations.append(authorization)
921 authorization.response(response, body)
922 break
923
924 if (self.follow_all_redirects or (method in ["GET", "HEAD"]) or response.status == 303):
925 if self.follow_redirects and response.status in [300, 301, 302, 303, 307]:
926 # Pick out the location header and basically start from the beginning
927 # remembering first to strip the ETag header and decrement our 'depth'
928 if redirections:
929 if not response.has_key('location') and response.status != 300:
930 raise RedirectMissingLocation( _("Redirected but the response is missing a Location: header."), response, content)
931 # Fix-up relative redirects (which violate an RFC 2616 MUST)
932 if response.has_key('location'):
933 location = response['location']
934 (scheme, authority, path, query, fragment) = parse_uri(location)
935 if authority == None:
936 response['location'] = urlparse.urljoin(absolute_uri, location)
937 if response.status == 301 and method in ["GET", "HEAD"]:
938 response['-x-permanent-redirect-url'] = response['location']
939 if not response.has_key('content-location'):
940 response['content-location'] = absolute_uri
941 _updateCache(headers, response, content, self.cache, cachekey)
942 if headers.has_key('if-none-match'):
943 del headers['if-none-match']
944 if headers.has_key('if-modified-since'):
945 del headers['if-modified-since']
946 if response.has_key('location'):
947 location = response['location']
948 old_response = copy.deepcopy(response)
949 if not old_response.has_key('content-location'):
950 old_response['content-location'] = absolute_uri
951 redirect_method = ((response.status == 303) and (method not in ["GET", "HEAD"])) and "GET" or method
952 (response, content) = self.request(location, redirect_method, body=body, headers = headers, redirections = redirections - 1)
953 response.previous = old_response
954 else:
955 raise RedirectLimit( _("Redirected more times than rediection_limit allows."), response, content)
956 elif response.status in [200, 203] and method == "GET":
957 # Don't cache 206's since we aren't going to handle byte range requests
958 if not response.has_key('content-location'):
959 response['content-location'] = absolute_uri
960 _updateCache(headers, response, content, self.cache, cachekey)
961
962 return (response, content)
963
964 def _normalize_headers(self, headers):
965 return _normalize_headers(headers)
966
967# Need to catch and rebrand some exceptions
968# Then need to optionally turn all exceptions into status codes
969# including all socket.* and httplib.* exceptions.
970
971
972 def request(self, uri, method="GET", body=None, headers=None, redirections=DEFAULT_MAX_REDIRECTS, connection_type=None):
973 """ Performs a single HTTP request.
974The 'uri' is the URI of the HTTP resource and can begin
975with either 'http' or 'https'. The value of 'uri' must be an absolute URI.
976
977The 'method' is the HTTP method to perform, such as GET, POST, DELETE, etc.
978There is no restriction on the methods allowed.
979
980The 'body' is the entity body to be sent with the request. It is a string
981object.
982
983Any extra headers that are to be sent with the request should be provided in the
984'headers' dictionary.
985
986The maximum number of redirect to follow before raising an
987exception is 'redirections. The default is 5.
988
989The return value is a tuple of (response, content), the first
990being and instance of the 'Response' class, the second being
991a string that contains the response entity body.
992 """
993 try:
994 if headers is None:
995 headers = {}
996 else:
997 headers = self._normalize_headers(headers)
998
999 if not headers.has_key('user-agent'):
1000 headers['user-agent'] = "Python-httplib2/%s" % __version__
1001
1002 uri = iri2uri(uri)
1003
1004 (scheme, authority, request_uri, defrag_uri) = urlnorm(uri)
1005 domain_port = authority.split(":")[0:2]
1006 if len(domain_port) == 2 and domain_port[1] == '443' and scheme == 'http':
1007 scheme = 'https'
1008 authority = domain_port[0]
1009
1010 conn_key = scheme+":"+authority
1011 if conn_key in self.connections:
1012 conn = self.connections[conn_key]
1013 else:
1014 if not connection_type:
1015 connection_type = (scheme == 'https') and HTTPSConnectionWithTimeout or HTTPConnectionWithTimeout
1016 certs = list(self.certificates.iter(authority))
1017 if scheme == 'https' and certs:
1018 conn = self.connections[conn_key] = connection_type(authority, key_file=certs[0][0],
1019 cert_file=certs[0][1], timeout=self.timeout, proxy_info=self.proxy_info)
1020 else:
1021 conn = self.connections[conn_key] = connection_type(authority, timeout=self.timeout, proxy_info=self.proxy_info)
1022 conn.set_debuglevel(debuglevel)
1023
1024 if method in ["GET", "HEAD"] and 'range' not in headers and 'accept-encoding' not in headers:
1025 headers['accept-encoding'] = 'gzip, deflate'
1026
1027 info = email.Message.Message()
1028 cached_value = None
1029 if self.cache:
1030 cachekey = defrag_uri
1031 cached_value = self.cache.get(cachekey)
1032 if cached_value:
1033 # info = email.message_from_string(cached_value)
1034 #
1035 # Need to replace the line above with the kludge below
1036 # to fix the non-existent bug not fixed in this
1037 # bug report: http://mail.python.org/pipermail/python-bugs-list/2005-September/030289.html
1038 try:
1039 info, content = cached_value.split('\r\n\r\n', 1)
1040 feedparser = email.FeedParser.FeedParser()
1041 feedparser.feed(info)
1042 info = feedparser.close()
1043 feedparser._parse = None
1044 except IndexError:
1045 self.cache.delete(cachekey)
1046 cachekey = None
1047 cached_value = None
1048 else:
1049 cachekey = None
1050
1051 if method in self.optimistic_concurrency_methods and self.cache and info.has_key('etag') and not self.ignore_etag and 'if-match' not in headers:
1052 # http://www.w3.org/1999/04/Editing/
1053 headers['if-match'] = info['etag']
1054
1055 if method not in ["GET", "HEAD"] and self.cache and cachekey:
1056 # RFC 2616 Section 13.10
1057 self.cache.delete(cachekey)
1058
1059 # Check the vary header in the cache to see if this request
1060 # matches what varies in the cache.
1061 if method in ['GET', 'HEAD'] and 'vary' in info:
1062 vary = info['vary']
1063 vary_headers = vary.lower().replace(' ', '').split(',')
1064 for header in vary_headers:
1065 key = '-varied-%s' % header
1066 value = info[key]
1067 if headers.get(header, None) != value:
1068 cached_value = None
1069 break
1070
1071 if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers:
1072 if info.has_key('-x-permanent-redirect-url'):
1073 # Should cached permanent redirects be counted in our redirection count? For now, yes.
1074 (response, new_content) = self.request(info['-x-permanent-redirect-url'], "GET", headers = headers, redirections = redirections - 1)
1075 response.previous = Response(info)
1076 response.previous.fromcache = True
1077 else:
1078 # Determine our course of action:
1079 # Is the cached entry fresh or stale?
1080 # Has the client requested a non-cached response?
1081 #
1082 # There seems to be three possible answers:
1083 # 1. [FRESH] Return the cache entry w/o doing a GET
1084 # 2. [STALE] Do the GET (but add in cache validators if available)
1085 # 3. [TRANSPARENT] Do a GET w/o any cache validators (Cache-Control: no-cache) on the request
1086 entry_disposition = _entry_disposition(info, headers)
1087
1088 if entry_disposition == "FRESH":
1089 if not cached_value:
1090 info['status'] = '504'
1091 content = ""
1092 response = Response(info)
1093 if cached_value:
1094 response.fromcache = True
1095 return (response, content)
1096
1097 if entry_disposition == "STALE":
1098 if info.has_key('etag') and not self.ignore_etag and not 'if-none-match' in headers:
1099 headers['if-none-match'] = info['etag']
1100 if info.has_key('last-modified') and not 'last-modified' in headers:
1101 headers['if-modified-since'] = info['last-modified']
1102 elif entry_disposition == "TRANSPARENT":
1103 pass
1104
1105 (response, new_content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
1106
1107 if response.status == 304 and method == "GET":
1108 # Rewrite the cache entry with the new end-to-end headers
1109 # Take all headers that are in response
1110 # and overwrite their values in info.
1111 # unless they are hop-by-hop, or are listed in the connection header.
1112
1113 for key in _get_end2end_headers(response):
1114 info[key] = response[key]
1115 merged_response = Response(info)
1116 if hasattr(response, "_stale_digest"):
1117 merged_response._stale_digest = response._stale_digest
1118 _updateCache(headers, merged_response, content, self.cache, cachekey)
1119 response = merged_response
1120 response.status = 200
1121 response.fromcache = True
1122
1123 elif response.status == 200:
1124 content = new_content
1125 else:
1126 self.cache.delete(cachekey)
1127 content = new_content
1128 else:
1129 cc = _parse_cache_control(headers)
1130 if cc.has_key('only-if-cached'):
1131 info['status'] = '504'
1132 response = Response(info)
1133 content = ""
1134 else:
1135 (response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
1136 except Exception, e:
1137 if self.force_exception_to_status_code:
1138 if isinstance(e, HttpLib2ErrorWithResponse):
1139 response = e.response
1140 content = e.content
1141 response.status = 500
1142 response.reason = str(e)
1143 elif isinstance(e, socket.timeout):
1144 content = "Request Timeout"
1145 response = Response( {
1146 "content-type": "text/plain",
1147 "status": "408",
1148 "content-length": len(content)
1149 })
1150 response.reason = "Request Timeout"
1151 else:
1152 content = str(e)
1153 response = Response( {
1154 "content-type": "text/plain",
1155 "status": "400",
1156 "content-length": len(content)
1157 })
1158 response.reason = "Bad Request"
1159 else:
1160 raise
1161
1162
1163 return (response, content)
1164
1165
1166
class Response(dict):
    """An object more like email.Message than httplib.HTTPResponse."""

    """Is this response from our local cache"""
    fromcache = False

    """HTTP protocol version used by server. 10 for HTTP/1.0, 11 for HTTP/1.1. """
    version = 11

    "Status code returned by server. "
    status = 200

    """Reason phrase returned by server."""
    reason = "Ok"

    # The Response (if any) this one was reached from, e.g. the response
    # that carried the redirect that led here (set by Http._request).
    previous = None

    def __init__(self, info):
        # info is either an email.Message or
        # an httplib.HTTPResponse object.
        if isinstance(info, httplib.HTTPResponse):
            # Header names are lower-cased only on this path; the other
            # two branches copy keys as-is.
            for key, value in info.getheaders():
                self[key.lower()] = value
            self.status = info.status
            self['status'] = str(self.status)
            self.reason = info.reason
            self.version = info.version
        elif isinstance(info, email.Message.Message):
            for key, value in info.items():
                self[key] = value
            # An email.Message source is required to carry a 'status' header.
            self.status = int(self['status'])
        else:
            # Plain dict source; 'status' is optional and defaults to 200.
            for key, value in info.iteritems():
                self[key] = value
            self.status = int(self.get('status', self.status))


    def __getattr__(self, name):
        # Expose 'resp.dict' as an alias for the response mapping itself.
        if name == 'dict':
            return self
        else:
            raise AttributeError, name