from __future__ import generators
"""
httplib2

A caching http interface that supports ETags and gzip
to conserve bandwidth.

Requires Python 2.3 or later

Changelog:
2007-08-18, Rick: Modified so it's able to use a socks proxy if needed.

"""

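# Typical usage, as a quick sketch (Http and Response are defined below):
#
#   import httplib2
#   h = httplib2.Http(".cache")
#   (resp, content) = h.request("http://example.org/", "GET")
#
# 'resp' is a Response (a dict of headers plus status attributes) and
# 'content' is the entity body as a string.
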
__author__ = "Joe Gregorio (joe@bitworking.org)"
__copyright__ = "Copyright 2006, Joe Gregorio"
__contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)",
    "James Antill",
    "Xavier Verges Farrero",
    "Jonathan Feinberg",
    "Blair Zajac",
    "Sam Ruby",
    "Louis Nyffenegger"]
__license__ = "MIT"
__version__ = "$Rev$"

import re
import sys
import email
import email.Utils
import email.Message
import email.FeedParser
import StringIO
import gzip
import zlib
import httplib
import urlparse
import base64
import os
import copy
import calendar
import time
import random
import errno
# remove deprecated warning in python2.6
try:
    from hashlib import sha1 as _sha, md5 as _md5
except ImportError:
    import sha
    import md5
    _sha = sha.new
    _md5 = md5.new
import hmac
from gettext import gettext as _
import socket

try:
    from httplib2 import socks
except ImportError:
    socks = None

# Build the appropriate socket wrapper for ssl
try:
    import ssl # python 2.6
    _ssl_wrap_socket = ssl.wrap_socket
except ImportError:
    def _ssl_wrap_socket(sock, key_file, cert_file):
        ssl_sock = socket.ssl(sock, key_file, cert_file)
        return httplib.FakeSocket(sock, ssl_sock)


if sys.version_info >= (2,3):
    from iri2uri import iri2uri
else:
    def iri2uri(uri):
        return uri

def has_timeout(timeout): # python 2.6
    if hasattr(socket, '_GLOBAL_DEFAULT_TIMEOUT'):
        return (timeout is not None and timeout is not socket._GLOBAL_DEFAULT_TIMEOUT)
    return (timeout is not None)

__all__ = ['Http', 'Response', 'ProxyInfo', 'HttpLib2Error',
  'RedirectMissingLocation', 'RedirectLimit', 'FailedToDecompressContent',
  'UnimplementedDigestAuthOptionError', 'UnimplementedHmacDigestAuthOptionError',
  'debuglevel', 'ProxiesUnavailableError']


# The httplib debug level, set to a non-zero value to get debug output
debuglevel = 0


# Python 2.3 support
if sys.version_info < (2,4):
    def sorted(seq):
        seq.sort()
        return seq

# Python 2.3 support
def HTTPResponse__getheaders(self):
    """Return list of (header, value) tuples."""
    if self.msg is None:
        raise httplib.ResponseNotReady()
    return self.msg.items()

if not hasattr(httplib.HTTPResponse, 'getheaders'):
    httplib.HTTPResponse.getheaders = HTTPResponse__getheaders

# All exceptions raised here derive from HttpLib2Error
class HttpLib2Error(Exception): pass

# Some exceptions can be caught and optionally
# be turned back into responses.
class HttpLib2ErrorWithResponse(HttpLib2Error):
    def __init__(self, desc, response, content):
        self.response = response
        self.content = content
        HttpLib2Error.__init__(self, desc)

class RedirectMissingLocation(HttpLib2ErrorWithResponse): pass
class RedirectLimit(HttpLib2ErrorWithResponse): pass
class FailedToDecompressContent(HttpLib2ErrorWithResponse): pass
class UnimplementedDigestAuthOptionError(HttpLib2ErrorWithResponse): pass
class UnimplementedHmacDigestAuthOptionError(HttpLib2ErrorWithResponse): pass

class RelativeURIError(HttpLib2Error): pass
class ServerNotFoundError(HttpLib2Error): pass
class ProxiesUnavailableError(HttpLib2Error): pass

# Open Items:
# -----------
# Proxy support

# Are we removing the cached content too soon on PUT (only delete on 200 Maybe?)

# Pluggable cache storage (supports storing the cache in
# flat files by default. We need a plug-in architecture
# that can support Berkeley DB and Squid)

# == Known Issues ==
# Does not handle a resource that uses conneg and Last-Modified but no ETag as a cache validator.
# Does not handle Cache-Control: max-stale
# Does not use Age: headers when calculating cache freshness.


# The number of redirections to follow before giving up.
# Note that only GET redirects are automatically followed.
# Will also honor 301 requests by saving that info and never
# requesting that URI again.
DEFAULT_MAX_REDIRECTS = 5

# Which headers are hop-by-hop headers by default
HOP_BY_HOP = ['connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization', 'te', 'trailers', 'transfer-encoding', 'upgrade']

def _get_end2end_headers(response):
    hopbyhop = list(HOP_BY_HOP)
    hopbyhop.extend([x.strip() for x in response.get('connection', '').split(',')])
    return [header for header in response.keys() if header not in hopbyhop]

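# A small illustration with made-up header values: a 'connection' header
# can name extra per-response hop-by-hop headers, which are excluded
# along with the defaults above.
#
#   _get_end2end_headers({'connection': 'x-foo', 'x-foo': '1', 'etag': '"a"'})
#   # -> ['etag']
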
URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")

def parse_uri(uri):
    """Parses a URI using the regex given in Appendix B of RFC 3986.

        (scheme, authority, path, query, fragment) = parse_uri(uri)
    """
    groups = URI.match(uri).groups()
    return (groups[1], groups[3], groups[4], groups[6], groups[8])

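# For example:
#
#   parse_uri("http://example.com/a/b?q=1#frag")
#   # -> ('http', 'example.com', '/a/b', 'q=1', 'frag')
#
# Absent components such as the query or fragment come back as None.
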
def urlnorm(uri):
    (scheme, authority, path, query, fragment) = parse_uri(uri)
    if not scheme or not authority:
        raise RelativeURIError("Only absolute URIs are allowed. uri = %s" % uri)
    authority = authority.lower()
    scheme = scheme.lower()
    if not path:
        path = "/"
    # Could do syntax based normalization of the URI before
    # computing the digest. See Section 6.2.2 of Std 66.
    request_uri = query and "?".join([path, query]) or path
    defrag_uri = scheme + "://" + authority + request_uri
    return scheme, authority, request_uri, defrag_uri

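# e.g. urlnorm("HTTP://Example.COM/Path?q=1") returns
#   ('http', 'example.com', '/Path?q=1', 'http://example.com/Path?q=1')
# The scheme and authority are case-normalized; the path is left alone.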

# Cache filename construction (original borrowed from Venus http://intertwingly.net/code/venus/)
re_url_scheme = re.compile(r'^\w+://')
re_slash = re.compile(r'[?/:|]+')

def safename(filename):
    """Return a filename suitable for the cache.

    Strips dangerous and common characters to create a filename we
    can use to store the cache in.
    """

    try:
        if re_url_scheme.match(filename):
            if isinstance(filename, str):
                filename = filename.decode('utf-8')
                filename = filename.encode('idna')
            else:
                filename = filename.encode('idna')
    except UnicodeError:
        pass
    if isinstance(filename, unicode):
        filename = filename.encode('utf-8')
    filemd5 = _md5(filename).hexdigest()
    filename = re_url_scheme.sub("", filename)
    filename = re_slash.sub(",", filename)

    # limit length of filename
    if len(filename) > 200:
        filename = filename[:200]
    return ",".join((filename, filemd5))

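# The result is the munged URL joined to an MD5 of the original, e.g.
#
#   safename("http://example.org/index.html")
#   # -> "example.org,index.html,<md5 hex digest of the full URL>"
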
NORMALIZE_SPACE = re.compile(r'(?:\r\n)?[ \t]+')
def _normalize_headers(headers):
    # Fold LWS runs in each header value down to a single space.
    return dict([(key.lower(), NORMALIZE_SPACE.sub(' ', value).strip()) for (key, value) in headers.iteritems()])

def _parse_cache_control(headers):
    retval = {}
    if headers.has_key('cache-control'):
        parts = headers['cache-control'].split(',')
        parts_with_args = [tuple([x.strip().lower() for x in part.split("=", 1)]) for part in parts if -1 != part.find("=")]
        parts_wo_args = [(name.strip().lower(), 1) for name in parts if -1 == name.find("=")]
        retval = dict(parts_with_args + parts_wo_args)
    return retval

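# For example:
#
#   _parse_cache_control({'cache-control': 'max-age=3600, no-cache'})
#   # -> {'max-age': '3600', 'no-cache': 1}
#
# Valueless directives map to 1; values are kept as strings.
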
# Whether to use a strict mode to parse WWW-Authenticate headers
# Might lead to bad results in case of ill-formed header value,
# so disabled by default, falling back to relaxed parsing.
# Set to true to turn on, useful for testing servers.
USE_WWW_AUTH_STRICT_PARSING = 0

# In regex below:
# [^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+ matches a "token" as defined by HTTP
# "(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?" matches a "quoted-string" as defined by HTTP, when LWS have already been replaced by a single space
# Actually, as an auth-param value can be either a token or a quoted-string, they are combined in a single pattern which matches both:
# \"?((?<=\")(?:[^\0-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x08\x0A-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?
WWW_AUTH_STRICT = re.compile(r"^(?:\s*(?:,\s*)?([^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+)\s*=\s*\"?((?<=\")(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?)(.*)$")
WWW_AUTH_RELAXED = re.compile(r"^(?:\s*(?:,\s*)?([^ \t\r\n=]+)\s*=\s*\"?((?<=\")(?:[^\\\"]|\\.)*?(?=\")|(?<!\")[^ \t\r\n,]+(?!\"))\"?)(.*)$")
UNQUOTE_PAIRS = re.compile(r'\\(.)')
def _parse_www_authenticate(headers, headername='www-authenticate'):
    """Returns a dictionary of dictionaries, one dict
    per auth_scheme."""
    retval = {}
    if headers.has_key(headername):
        authenticate = headers[headername].strip()
        www_auth = USE_WWW_AUTH_STRICT_PARSING and WWW_AUTH_STRICT or WWW_AUTH_RELAXED
        while authenticate:
            # Break off the scheme at the beginning of the line
            if headername == 'authentication-info':
                (auth_scheme, the_rest) = ('digest', authenticate)
            else:
                (auth_scheme, the_rest) = authenticate.split(" ", 1)
            # Now loop over all the key value pairs that come after the scheme,
            # being careful not to roll into the next scheme
            match = www_auth.search(the_rest)
            auth_params = {}
            while match:
                if match and len(match.groups()) == 3:
                    (key, value, the_rest) = match.groups()
                    auth_params[key.lower()] = UNQUOTE_PAIRS.sub(r'\1', value) # '\\'.join([x.replace('\\', '') for x in value.split('\\\\')])
                match = www_auth.search(the_rest)
            retval[auth_scheme.lower()] = auth_params
            authenticate = the_rest.strip()
    return retval


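# e.g. a Digest challenge parses into one dict per scheme:
#
#   _parse_www_authenticate({'www-authenticate':
#           'Digest realm="test", qop="auth"'})
#   # -> {'digest': {'realm': 'test', 'qop': 'auth'}}
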
def _entry_disposition(response_headers, request_headers):
    """Determine freshness from the Date, Expires and Cache-Control headers.

    We don't handle the following:

    1. Cache-Control: max-stale
    2. Age: headers are not used in the calculations.

    Note that this algorithm is simpler than you might think
    because we are operating as a private (non-shared) cache.
    This lets us ignore 's-maxage'. We can also ignore
    'proxy-invalidate' since we aren't a proxy.
    We will never return a stale document as
    fresh as a design decision, and thus the non-implementation
    of 'max-stale'. This also lets us safely ignore 'must-revalidate'
    since we operate as if every server has sent 'must-revalidate'.
    Since we are private we get to ignore both 'public' and
    'private' parameters. We also ignore 'no-transform' since
    we don't do any transformations.
    The 'no-store' parameter is handled at a higher level.
    So the only Cache-Control parameters we look at are:

    no-cache
    only-if-cached
    max-age
    min-fresh
    """

    retval = "STALE"
    cc = _parse_cache_control(request_headers)
    cc_response = _parse_cache_control(response_headers)

    if request_headers.has_key('pragma') and request_headers['pragma'].lower().find('no-cache') != -1:
        retval = "TRANSPARENT"
        if 'cache-control' not in request_headers:
            request_headers['cache-control'] = 'no-cache'
    elif cc.has_key('no-cache'):
        retval = "TRANSPARENT"
    elif cc_response.has_key('no-cache'):
        retval = "STALE"
    elif cc.has_key('only-if-cached'):
        retval = "FRESH"
    elif response_headers.has_key('date'):
        date = calendar.timegm(email.Utils.parsedate_tz(response_headers['date']))
        now = time.time()
        current_age = max(0, now - date)
        if cc_response.has_key('max-age'):
            try:
                freshness_lifetime = int(cc_response['max-age'])
            except ValueError:
                freshness_lifetime = 0
        elif response_headers.has_key('expires'):
            expires = email.Utils.parsedate_tz(response_headers['expires'])
            if None == expires:
                freshness_lifetime = 0
            else:
                freshness_lifetime = max(0, calendar.timegm(expires) - date)
        else:
            freshness_lifetime = 0
        if cc.has_key('max-age'):
            try:
                freshness_lifetime = int(cc['max-age'])
            except ValueError:
                freshness_lifetime = 0
        if cc.has_key('min-fresh'):
            try:
                min_fresh = int(cc['min-fresh'])
            except ValueError:
                min_fresh = 0
            current_age += min_fresh
        if freshness_lifetime > current_age:
            retval = "FRESH"
    return retval

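# As a worked example: a response dated 600 seconds ago carrying
# 'cache-control: max-age=3600' is "FRESH" (age 600 < lifetime 3600);
# with 'max-age=300' it would be "STALE"; and a request sent with
# 'cache-control: no-cache' is "TRANSPARENT" regardless of age.
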
def _decompressContent(response, new_content):
    content = new_content
    try:
        encoding = response.get('content-encoding', None)
        if encoding in ['gzip', 'deflate']:
            if encoding == 'gzip':
                content = gzip.GzipFile(fileobj=StringIO.StringIO(new_content)).read()
            if encoding == 'deflate':
                content = zlib.decompress(content)
            response['content-length'] = str(len(content))
            # Record the historical presence of the encoding in a way that won't interfere.
            response['-content-encoding'] = response['content-encoding']
            del response['content-encoding']
    except IOError:
        content = ""
        raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding'), response, content)
    return content

def _updateCache(request_headers, response_headers, content, cache, cachekey):
    if cachekey:
        cc = _parse_cache_control(request_headers)
        cc_response = _parse_cache_control(response_headers)
        if cc.has_key('no-store') or cc_response.has_key('no-store'):
            cache.delete(cachekey)
        else:
            info = email.Message.Message()
            for key, value in response_headers.iteritems():
                if key not in ['status', 'content-encoding', 'transfer-encoding']:
                    info[key] = value

            # Add annotations to the cache to indicate what headers
            # are variant for this request.
            vary = response_headers.get('vary', None)
            if vary:
                vary_headers = vary.lower().replace(' ', '').split(',')
                for header in vary_headers:
                    key = '-varied-%s' % header
                    try:
                        info[key] = request_headers[header]
                    except KeyError:
                        pass

            # Cache a 304 under the 200 it revalidates.
            status = response_headers.status
            if status == 304:
                status = 200

            status_header = 'status: %d\r\n' % status

            header_str = info.as_string()

            header_str = re.sub("\r(?!\n)|(?<!\r)\n", "\r\n", header_str)
            text = "".join([status_header, header_str, content])

            cache.set(cachekey, text)

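# The stored cache entry is just the status line, the end-to-end headers
# and the body, e.g. (sketch):
#
#   status: 200\r\n
#   content-location: http://example.org/\r\n
#   \r\n
#   <entity body>
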
def _cnonce():
    dig = _md5("%s:%s" % (time.ctime(), ["0123456789"[random.randrange(0, 9)] for i in range(20)])).hexdigest()
    return dig[:16]

def _wsse_username_token(cnonce, iso_now, password):
    return base64.b64encode(_sha("%s%s%s" % (cnonce, iso_now, password)).digest()).strip()


# For credentials we need two things, first
# a pool of credentials to try (not necessarily tied to Basic, Digest, etc.)
# Then we also need a list of URIs that have already demanded authentication
# That list is tricky since sub-URIs can take the same auth, or the
# auth scheme may change as you descend the tree.
# So we also need each Auth instance to be able to tell us
# how close to the 'top' it is.

class Authentication(object):
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        (scheme, authority, path, query, fragment) = parse_uri(request_uri)
        self.path = path
        self.host = host
        self.credentials = credentials
        self.http = http

    def depth(self, request_uri):
        (scheme, authority, path, query, fragment) = parse_uri(request_uri)
        return request_uri[len(self.path):].count("/")

    def inscope(self, host, request_uri):
        # XXX Should we normalize the request_uri?
        (scheme, authority, path, query, fragment) = parse_uri(request_uri)
        return (host == self.host) and path.startswith(self.path)

    def request(self, method, request_uri, headers, content):
        """Modify the request headers to add the appropriate
        Authorization header. Override this in sub-classes."""
        pass

    def response(self, response, content):
        """Gives us a chance to update with new nonces
        or such returned from the last authorized response.
        Override this in sub-classes if necessary.

        Return TRUE if the request is to be retried, for
        example Digest may return stale=true.
        """
        return False


class BasicAuthentication(Authentication):
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)

    def request(self, method, request_uri, headers, content):
        """Modify the request headers to add the appropriate
        Authorization header."""
        headers['authorization'] = 'Basic ' + base64.b64encode("%s:%s" % self.credentials).strip()

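# e.g. with credentials ('user', 'pass') the header sent is
#   authorization: Basic dXNlcjpwYXNz
# i.e. just the base64 of "user:pass".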

class DigestAuthentication(Authentication):
    """Only do qop='auth' and MD5, since that
    is all Apache currently implements"""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
        challenge = _parse_www_authenticate(response, 'www-authenticate')
        self.challenge = challenge['digest']
        qop = self.challenge.get('qop', 'auth')
        self.challenge['qop'] = ('auth' in [x.strip() for x in qop.split()]) and 'auth' or None
        if self.challenge['qop'] is None:
            raise UnimplementedDigestAuthOptionError(_("Unsupported value for qop: %s." % qop))
        self.challenge['algorithm'] = self.challenge.get('algorithm', 'MD5').upper()
        if self.challenge['algorithm'] != 'MD5':
            raise UnimplementedDigestAuthOptionError(_("Unsupported value for algorithm: %s." % self.challenge['algorithm']))
        self.A1 = "".join([self.credentials[0], ":", self.challenge['realm'], ":", self.credentials[1]])
        self.challenge['nc'] = 1

    def request(self, method, request_uri, headers, content, cnonce = None):
        """Modify the request headers"""
        H = lambda x: _md5(x).hexdigest()
        KD = lambda s, d: H("%s:%s" % (s, d))
        A2 = "".join([method, ":", request_uri])
        self.challenge['cnonce'] = cnonce or _cnonce()
        request_digest = '"%s"' % KD(H(self.A1), "%s:%s:%s:%s:%s" % (self.challenge['nonce'],
                    '%08x' % self.challenge['nc'],
                    self.challenge['cnonce'],
                    self.challenge['qop'], H(A2)))
        headers['Authorization'] = 'Digest username="%s", realm="%s", nonce="%s", uri="%s", algorithm=%s, response=%s, qop=%s, nc=%08x, cnonce="%s"' % (
                self.credentials[0],
                self.challenge['realm'],
                self.challenge['nonce'],
                request_uri,
                self.challenge['algorithm'],
                request_digest,
                self.challenge['qop'],
                self.challenge['nc'],
                self.challenge['cnonce'],
                )
        self.challenge['nc'] += 1

    def response(self, response, content):
        if not response.has_key('authentication-info'):
            challenge = _parse_www_authenticate(response, 'www-authenticate').get('digest', {})
            if 'true' == challenge.get('stale'):
                self.challenge['nonce'] = challenge['nonce']
                self.challenge['nc'] = 1
                return True
        else:
            updated_challenge = _parse_www_authenticate(response, 'authentication-info').get('digest', {})

            if updated_challenge.has_key('nextnonce'):
                self.challenge['nonce'] = updated_challenge['nextnonce']
                self.challenge['nc'] = 1
        return False


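# The request digest computed above is the RFC 2617 qop="auth" form:
#
#   HA1      = MD5(username ":" realm ":" password)    # self.A1
#   HA2      = MD5(method ":" request_uri)             # A2
#   response = MD5(HA1 ":" nonce ":" nc ":" cnonce ":" qop ":" HA2)
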
class HmacDigestAuthentication(Authentication):
    """Adapted from Robert Sayre's code and DigestAuthentication above."""
    __author__ = "Thomas Broyer (t.broyer@ltgt.net)"

    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
        challenge = _parse_www_authenticate(response, 'www-authenticate')
        self.challenge = challenge['hmacdigest']
        # TODO: self.challenge['domain']
        self.challenge['reason'] = self.challenge.get('reason', 'unauthorized')
        if self.challenge['reason'] not in ['unauthorized', 'integrity']:
            self.challenge['reason'] = 'unauthorized'
        self.challenge['salt'] = self.challenge.get('salt', '')
        if not self.challenge.get('snonce'):
            raise UnimplementedHmacDigestAuthOptionError(_("The challenge doesn't contain a server nonce, or this one is empty."))
        self.challenge['algorithm'] = self.challenge.get('algorithm', 'HMAC-SHA-1')
        if self.challenge['algorithm'] not in ['HMAC-SHA-1', 'HMAC-MD5']:
            raise UnimplementedHmacDigestAuthOptionError(_("Unsupported value for algorithm: %s." % self.challenge['algorithm']))
        self.challenge['pw-algorithm'] = self.challenge.get('pw-algorithm', 'SHA-1')
        if self.challenge['pw-algorithm'] not in ['SHA-1', 'MD5']:
            raise UnimplementedHmacDigestAuthOptionError(_("Unsupported value for pw-algorithm: %s." % self.challenge['pw-algorithm']))
        if self.challenge['algorithm'] == 'HMAC-MD5':
            self.hashmod = _md5
        else:
            self.hashmod = _sha
        if self.challenge['pw-algorithm'] == 'MD5':
            self.pwhashmod = _md5
        else:
            self.pwhashmod = _sha
        # _md5/_sha are constructors, so call them directly rather than
        # through a .new attribute (which hashlib constructors lack).
        self.key = "".join([self.credentials[0], ":",
                    self.pwhashmod("".join([self.credentials[1], self.challenge['salt']])).hexdigest().lower(),
                    ":", self.challenge['realm']])
        self.key = self.pwhashmod(self.key).hexdigest().lower()

    def request(self, method, request_uri, headers, content):
        """Modify the request headers"""
        keys = _get_end2end_headers(headers)
        keylist = "".join(["%s " % k for k in keys])
        headers_val = "".join([headers[k] for k in keys])
        created = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
        cnonce = _cnonce()
        request_digest = "%s:%s:%s:%s:%s" % (method, request_uri, cnonce, self.challenge['snonce'], headers_val)
        request_digest = hmac.new(self.key, request_digest, self.hashmod).hexdigest().lower()
        headers['Authorization'] = 'HMACDigest username="%s", realm="%s", snonce="%s", cnonce="%s", uri="%s", created="%s", response="%s", headers="%s"' % (
                self.credentials[0],
                self.challenge['realm'],
                self.challenge['snonce'],
                cnonce,
                request_uri,
                created,
                request_digest,
                keylist,
                )

    def response(self, response, content):
        challenge = _parse_www_authenticate(response, 'www-authenticate').get('hmacdigest', {})
        if challenge.get('reason') in ['integrity', 'stale']:
            return True
        return False


class WsseAuthentication(Authentication):
    """This is thinly tested and should not be relied upon.
    At this time there isn't any third party server to test against.
    Blogger and TypePad implemented this algorithm at one point
    but Blogger has since switched to Basic over HTTPS and
    TypePad has implemented it wrong, by never issuing a 401
    challenge but instead requiring your client to telepathically know that
    their endpoint is expecting WSSE profile="UsernameToken"."""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)

    def request(self, method, request_uri, headers, content):
        """Modify the request headers to add the appropriate
        Authorization header."""
        headers['Authorization'] = 'WSSE profile="UsernameToken"'
        iso_now = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        cnonce = _cnonce()
        password_digest = _wsse_username_token(cnonce, iso_now, self.credentials[1])
        headers['X-WSSE'] = 'UsernameToken Username="%s", PasswordDigest="%s", Nonce="%s", Created="%s"' % (
                self.credentials[0],
                password_digest,
                cnonce,
                iso_now)

class GoogleLoginAuthentication(Authentication):
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        from urllib import urlencode
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
        challenge = _parse_www_authenticate(response, 'www-authenticate')
        service = challenge['googlelogin'].get('service', 'xapi')
        # Blogger actually returns the service in the challenge
        # For the rest we guess based on the URI
        if service == 'xapi' and request_uri.find("calendar") > 0:
            service = "cl"
        # No point in guessing Base or Spreadsheet
        #elif request_uri.find("spreadsheets") > 0:
        #    service = "wise"

        auth = dict(Email=credentials[0], Passwd=credentials[1], service=service, source=headers['user-agent'])
        resp, content = self.http.request("https://www.google.com/accounts/ClientLogin", method="POST", body=urlencode(auth), headers={'Content-Type': 'application/x-www-form-urlencoded'})
        lines = content.split('\n')
        d = dict([tuple(line.split("=", 1)) for line in lines if line])
        if resp.status == 403:
            self.Auth = ""
        else:
            self.Auth = d['Auth']

    def request(self, method, request_uri, headers, content):
        """Modify the request headers to add the appropriate
        Authorization header."""
        headers['authorization'] = 'GoogleLogin Auth=' + self.Auth


AUTH_SCHEME_CLASSES = {
    "basic": BasicAuthentication,
    "wsse": WsseAuthentication,
    "digest": DigestAuthentication,
    "hmacdigest": HmacDigestAuthentication,
    "googlelogin": GoogleLoginAuthentication
}

AUTH_SCHEME_ORDER = ["hmacdigest", "googlelogin", "digest", "wsse", "basic"]

class FileCache(object):
    """Uses a local directory as a store for cached files.
    Not really safe to use if multiple threads or processes are going to
    be running on the same cache.
    """
    def __init__(self, cache, safe=safename): # use safe=lambda x: md5.new(x).hexdigest() for the old behavior
        self.cache = cache
        self.safe = safe
        if not os.path.exists(cache):
            os.makedirs(self.cache)

    def get(self, key):
        retval = None
        cacheFullPath = os.path.join(self.cache, self.safe(key))
        try:
            f = file(cacheFullPath, "rb")
            retval = f.read()
            f.close()
        except IOError:
            pass
        return retval

    def set(self, key, value):
        cacheFullPath = os.path.join(self.cache, self.safe(key))
        f = file(cacheFullPath, "wb")
        f.write(value)
        f.close()

    def delete(self, key):
        cacheFullPath = os.path.join(self.cache, self.safe(key))
        if os.path.exists(cacheFullPath):
            os.remove(cacheFullPath)

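# Any object with the same get/set/delete interface can replace
# FileCache; a minimal in-memory sketch (not part of this library):
#
#   class MemoryCache(object):
#       def __init__(self):
#           self._store = {}
#       def get(self, key):
#           return self._store.get(key)
#       def set(self, key, value):
#           self._store[key] = value
#       def delete(self, key):
#           self._store.pop(key, None)
#
#   h = Http(cache=MemoryCache())
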
class Credentials(object):
    def __init__(self):
        self.credentials = []

    def add(self, name, password, domain=""):
        self.credentials.append((domain.lower(), name, password))

    def clear(self):
        self.credentials = []

    def iter(self, domain):
        for (cdomain, name, password) in self.credentials:
            if cdomain == "" or domain == cdomain:
                yield (name, password)

class KeyCerts(Credentials):
    """Identical to Credentials except that
    name/password are mapped to key/cert."""
    pass


class ProxyInfo(object):
    """Collect information required to use a proxy."""
    def __init__(self, proxy_type, proxy_host, proxy_port, proxy_rdns=None, proxy_user=None, proxy_pass=None):
        """The parameter proxy_type must be set to one of socks.PROXY_TYPE_XXX
        constants. For example:

        p = ProxyInfo(proxy_type=socks.PROXY_TYPE_HTTP, proxy_host='localhost', proxy_port=8000)
        """
        self.proxy_type, self.proxy_host, self.proxy_port, self.proxy_rdns, self.proxy_user, self.proxy_pass = proxy_type, proxy_host, proxy_port, proxy_rdns, proxy_user, proxy_pass

    def astuple(self):
        return (self.proxy_type, self.proxy_host, self.proxy_port, self.proxy_rdns,
                self.proxy_user, self.proxy_pass)

    def isgood(self):
        return (self.proxy_host != None) and (self.proxy_port != None)

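# e.g. to send all traffic through a local SOCKS5 proxy (a sketch that
# assumes the bundled socks module imported above is available):
#
#   proxy = ProxyInfo(socks.PROXY_TYPE_SOCKS5, 'localhost', 1080)
#   h = Http(proxy_info=proxy)
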
class HTTPConnectionWithTimeout(httplib.HTTPConnection):
    """HTTPConnection subclass that supports timeouts"""

    def __init__(self, host, port=None, strict=None, timeout=None, proxy_info=None):
        httplib.HTTPConnection.__init__(self, host, port, strict)
        self.timeout = timeout
        self.proxy_info = proxy_info

    def connect(self):
        """Connect to the host and port specified in __init__."""
        # Mostly verbatim from httplib.py.
        if self.proxy_info and socks is None:
            raise ProxiesUnavailableError(
                'Proxy support missing but proxy use was requested!')
        msg = "getaddrinfo returns an empty list"
        for res in socket.getaddrinfo(self.host, self.port, 0,
                socket.SOCK_STREAM):
            af, socktype, proto, canonname, sa = res
            try:
                if self.proxy_info and self.proxy_info.isgood():
                    self.sock = socks.socksocket(af, socktype, proto)
                    self.sock.setproxy(*self.proxy_info.astuple())
                else:
                    self.sock = socket.socket(af, socktype, proto)
                    self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
                # Different from httplib: support timeouts.
                if has_timeout(self.timeout):
                    self.sock.settimeout(self.timeout)
                # End of difference from httplib.
                if self.debuglevel > 0:
                    print "connect: (%s, %s)" % (self.host, self.port)

                self.sock.connect(sa)
            except socket.error, msg:
                if self.debuglevel > 0:
                    print 'connect fail:', (self.host, self.port)
                if self.sock:
                    self.sock.close()
                self.sock = None
                continue
            break
        if not self.sock:
            raise socket.error, msg

class HTTPSConnectionWithTimeout(httplib.HTTPSConnection):
    "This class allows communication via SSL."

    def __init__(self, host, port=None, key_file=None, cert_file=None,
                 strict=None, timeout=None, proxy_info=None):
        httplib.HTTPSConnection.__init__(self, host, port=port, key_file=key_file,
                cert_file=cert_file, strict=strict)
        self.timeout = timeout
        self.proxy_info = proxy_info

    def connect(self):
        "Connect to a host on a given (SSL) port."

        if self.proxy_info and self.proxy_info.isgood():
            sock = socks.socksocket(socket.AF_INET, socket.SOCK_STREAM)
            sock.setproxy(*self.proxy_info.astuple())
        else:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)

        if has_timeout(self.timeout):
            sock.settimeout(self.timeout)
        sock.connect((self.host, self.port))
        self.sock = _ssl_wrap_socket(sock, self.key_file, self.cert_file)



class Http(object):
    """An HTTP client that handles:
- all methods
- caching
- ETags
- compression
- HTTPS
- Basic
- Digest
- WSSE

and more.
    """
    def __init__(self, cache=None, timeout=None, proxy_info=None):
        """The value of proxy_info is a ProxyInfo instance.

If 'cache' is a string then it is used as a directory name
for a disk cache. Otherwise it must be an object that supports
the same interface as FileCache."""
        self.proxy_info = proxy_info
        # Map domain name to an httplib connection
        self.connections = {}
        # The location of the cache, for now a directory
        # where cached responses are held.
        if cache and isinstance(cache, str):
            self.cache = FileCache(cache)
        else:
            self.cache = cache

        # Name/password
        self.credentials = Credentials()

        # Key/cert
        self.certificates = KeyCerts()

        # authorization objects
        self.authorizations = []

        # If set to False then no redirects are followed, even safe ones.
        self.follow_redirects = True

        # Which HTTP methods do we apply optimistic concurrency to, i.e.
        # which methods get an "if-match:" etag header added to them.
        self.optimistic_concurrency_methods = ["PUT"]

        # If 'follow_redirects' is True, and this is set to True then
        # all redirects are followed, including unsafe ones.
        self.follow_all_redirects = False

        self.ignore_etag = False

        self.force_exception_to_status_code = False

        self.timeout = timeout

    def _auth_from_challenge(self, host, request_uri, headers, response, content):
        """A generator that creates Authorization objects
        that can be applied to requests.
        """
        challenges = _parse_www_authenticate(response, 'www-authenticate')
        for cred in self.credentials.iter(host):
            for scheme in AUTH_SCHEME_ORDER:
                if challenges.has_key(scheme):
                    yield AUTH_SCHEME_CLASSES[scheme](cred, host, request_uri, headers, response, content, self)

    def add_credentials(self, name, password, domain=""):
        """Add a name and password that will be used
        any time a request requires authentication."""
        self.credentials.add(name, password, domain)

    def add_certificate(self, key, cert, domain):
        """Add a key and cert that will be used
        any time a request requires authentication."""
        self.certificates.add(key, cert, domain)

    def clear_credentials(self):
        """Remove all the names and passwords
        that are used for authentication"""
        self.credentials.clear()
        self.authorizations = []

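    # Credential handling in a nutshell (a sketch): credentials are only
    # sent after the server answers 401 with a WWW-Authenticate challenge,
    # via _auth_from_challenge above.
    #
    #   h = Http()
    #   h.add_credentials('jo', 'mypassword', 'example.org')
    #   (resp, content) = h.request('http://example.org/private')
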
    def _conn_request(self, conn, request_uri, method, body, headers):
        for i in range(2):
            try:
                conn.request(method, request_uri, body, headers)
            except socket.gaierror:
                conn.close()
                raise ServerNotFoundError("Unable to find the server at %s" % conn.host)
            except socket.error, e:
                if e.errno == errno.ECONNREFUSED: # Connection refused
                    raise
            except httplib.HTTPException:
                # Just because the server closed the connection doesn't apparently mean
                # that the server didn't send a response.
                pass
            try:
                response = conn.getresponse()
            except (socket.error, httplib.HTTPException):
                if i == 0:
                    conn.close()
                    conn.connect()
                    continue
                else:
                    raise
            else:
                content = ""
                if method == "HEAD":
                    response.close()
                else:
                    content = response.read()
                response = Response(response)
                if method != "HEAD":
                    content = _decompressContent(response, content)
                break
        return (response, content)


    def _request(self, conn, host, absolute_uri, request_uri, method, body, headers, redirections, cachekey):
        """Do the actual request using the connection object
        and also follow one level of redirects if necessary"""

        auths = [(auth.depth(request_uri), auth) for auth in self.authorizations if auth.inscope(host, request_uri)]
        auth = auths and sorted(auths)[0][1] or None
        if auth:
            auth.request(method, request_uri, headers, body)

        (response, content) = self._conn_request(conn, request_uri, method, body, headers)

        if auth:
            if auth.response(response, body):
                auth.request(method, request_uri, headers, body)
                (response, content) = self._conn_request(conn, request_uri, method, body, headers)
                response._stale_digest = 1

        if response.status == 401:
            for authorization in self._auth_from_challenge(host, request_uri, headers, response, content):
                authorization.request(method, request_uri, headers, body)
                (response, content) = self._conn_request(conn, request_uri, method, body, headers)
                if response.status != 401:
                    self.authorizations.append(authorization)
                    authorization.response(response, body)
                    break

        if (self.follow_all_redirects or (method in ["GET", "HEAD"]) or response.status == 303):
            if self.follow_redirects and response.status in [300, 301, 302, 303, 307]:
                # Pick out the location header and basically start from the beginning
                # remembering first to strip the ETag header and decrement our 'depth'
                if redirections:
                    if not response.has_key('location') and response.status != 300:
                        raise RedirectMissingLocation(_("Redirected but the response is missing a Location: header."), response, content)
                    # Fix-up relative redirects (which violate an RFC 2616 MUST)
                    if response.has_key('location'):
                        location = response['location']
                        (scheme, authority, path, query, fragment) = parse_uri(location)
                        if authority == None:
                            response['location'] = urlparse.urljoin(absolute_uri, location)
                    if response.status == 301 and method in ["GET", "HEAD"]:
                        response['-x-permanent-redirect-url'] = response['location']
                        if not response.has_key('content-location'):
                            response['content-location'] = absolute_uri
                        _updateCache(headers, response, content, self.cache, cachekey)
                    if headers.has_key('if-none-match'):
                        del headers['if-none-match']
                    if headers.has_key('if-modified-since'):
                        del headers['if-modified-since']
                    if response.has_key('location'):
                        location = response['location']
                        old_response = copy.deepcopy(response)
                        if not old_response.has_key('content-location'):
                            old_response['content-location'] = absolute_uri
                        redirect_method = ((response.status == 303) and (method not in ["GET", "HEAD"])) and "GET" or method
                        (response, content) = self.request(location, redirect_method, body=body, headers=headers, redirections=redirections - 1)
                        response.previous = old_response
                else:
                    raise RedirectLimit(_("Redirected more times than redirection_limit allows."), response, content)
            elif response.status in [200, 203] and method == "GET":
                # Don't cache 206's since we aren't going to handle byte range requests
                if not response.has_key('content-location'):
                    response['content-location'] = absolute_uri
                _updateCache(headers, response, content, self.cache, cachekey)

        return (response, content)

    def _normalize_headers(self, headers):
        return _normalize_headers(headers)

# Need to catch and rebrand some exceptions
# Then need to optionally turn all exceptions into status codes
# including all socket.* and httplib.* exceptions.


    def request(self, uri, method="GET", body=None, headers=None, redirections=DEFAULT_MAX_REDIRECTS, connection_type=None):
        """ Performs a single HTTP request.
The 'uri' is the URI of the HTTP resource and can begin
with either 'http' or 'https'. The value of 'uri' must be an absolute URI.

The 'method' is the HTTP method to perform, such as GET, POST, DELETE, etc.
There is no restriction on the methods allowed.

The 'body' is the entity body to be sent with the request. It is a string
object.

Any extra headers that are to be sent with the request should be provided in the
'headers' dictionary.

The maximum number of redirects to follow before raising an
exception is 'redirections'. The default is 5.

The return value is a tuple of (response, content), the first
being an instance of the 'Response' class, the second being
a string that contains the response entity body.
        """
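        # e.g. (sketch):
        #   (resp, content) = h.request("http://example.org/new-page",
        #                               method="PUT", body="New content",
        #                               headers={'content-type': 'text/plain'})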
        try:
            if headers is None:
                headers = {}
            else:
                headers = self._normalize_headers(headers)

            if not headers.has_key('user-agent'):
                headers['user-agent'] = "Python-httplib2/%s" % __version__

            uri = iri2uri(uri)

            (scheme, authority, request_uri, defrag_uri) = urlnorm(uri)
            domain_port = authority.split(":")[0:2]
            if len(domain_port) == 2 and domain_port[1] == '443' and scheme == 'http':
                scheme = 'https'
                authority = domain_port[0]

            conn_key = scheme + ":" + authority
            if conn_key in self.connections:
                conn = self.connections[conn_key]
            else:
                if not connection_type:
                    connection_type = (scheme == 'https') and HTTPSConnectionWithTimeout or HTTPConnectionWithTimeout
                certs = list(self.certificates.iter(authority))
                if scheme == 'https' and certs:
                    conn = self.connections[conn_key] = connection_type(authority, key_file=certs[0][0],
                        cert_file=certs[0][1], timeout=self.timeout, proxy_info=self.proxy_info)
                else:
                    conn = self.connections[conn_key] = connection_type(authority, timeout=self.timeout, proxy_info=self.proxy_info)
                conn.set_debuglevel(debuglevel)

            if method in ["GET", "HEAD"] and 'range' not in headers and 'accept-encoding' not in headers:
                headers['accept-encoding'] = 'gzip, deflate'

            info = email.Message.Message()
            cached_value = None
            if self.cache:
                cachekey = defrag_uri
                cached_value = self.cache.get(cachekey)
                if cached_value:
                    # info = email.message_from_string(cached_value)
                    #
                    # Need to replace the line above with the kludge below
                    # to fix the non-existent bug not fixed in this
                    # bug report: http://mail.python.org/pipermail/python-bugs-list/2005-September/030289.html
                    try:
                        info, content = cached_value.split('\r\n\r\n', 1)
                        feedparser = email.FeedParser.FeedParser()
                        feedparser.feed(info)
                        info = feedparser.close()
                        feedparser._parse = None
                    except (IndexError, ValueError):
                        # A missing blank line makes the unpack above fail
                        # with ValueError; treat the entry as corrupt.
                        self.cache.delete(cachekey)
                        cachekey = None
                        cached_value = None
            else:
                cachekey = None

            if method in self.optimistic_concurrency_methods and self.cache and info.has_key('etag') and not self.ignore_etag and 'if-match' not in headers:
                # http://www.w3.org/1999/04/Editing/
                headers['if-match'] = info['etag']

            if method not in ["GET", "HEAD"] and self.cache and cachekey:
                # RFC 2616 Section 13.10
                self.cache.delete(cachekey)

            # Check the vary header in the cache to see if this request
            # matches what varies in the cache.
            if method in ['GET', 'HEAD'] and 'vary' in info:
                vary = info['vary']
                vary_headers = vary.lower().replace(' ', '').split(',')
                for header in vary_headers:
                    key = '-varied-%s' % header
                    value = info[key]
                    if headers.get(header, None) != value:
                        cached_value = None
                        break

            if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers:
                if info.has_key('-x-permanent-redirect-url'):
                    # Should cached permanent redirects be counted in our redirection count? For now, yes.
                    (response, new_content) = self.request(info['-x-permanent-redirect-url'], "GET", headers=headers, redirections=redirections - 1)
                    response.previous = Response(info)
                    response.previous.fromcache = True
                else:
                    # Determine our course of action:
                    #   Is the cached entry fresh or stale?
                    #   Has the client requested a non-cached response?
                    #
                    # There seems to be three possible answers:
                    # 1. [FRESH] Return the cache entry w/o doing a GET
                    # 2. [STALE] Do the GET (but add in cache validators if available)
                    # 3. [TRANSPARENT] Do a GET w/o any cache validators (Cache-Control: no-cache) on the request
                    entry_disposition = _entry_disposition(info, headers)

                    if entry_disposition == "FRESH":
                        if not cached_value:
                            info['status'] = '504'
                            content = ""
                        response = Response(info)
                        if cached_value:
                            response.fromcache = True
                        return (response, content)

                    if entry_disposition == "STALE":
                        if info.has_key('etag') and not self.ignore_etag and not 'if-none-match' in headers:
                            headers['if-none-match'] = info['etag']
                        if info.has_key('last-modified') and not 'last-modified' in headers:
                            headers['if-modified-since'] = info['last-modified']
                    elif entry_disposition == "TRANSPARENT":
                        pass

                    (response, new_content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)

                if response.status == 304 and method == "GET":
                    # Rewrite the cache entry with the new end-to-end headers
                    # Take all headers that are in response
                    # and overwrite their values in info.
                    # unless they are hop-by-hop, or are listed in the connection header.

                    for key in _get_end2end_headers(response):
                        info[key] = response[key]
                    merged_response = Response(info)
                    if hasattr(response, "_stale_digest"):
                        merged_response._stale_digest = response._stale_digest
                    _updateCache(headers, merged_response, content, self.cache, cachekey)
                    response = merged_response
                    response.status = 200
                    response.fromcache = True

                elif response.status == 200:
                    content = new_content
                else:
                    self.cache.delete(cachekey)
                    content = new_content
            else:
                cc = _parse_cache_control(headers)
                if cc.has_key('only-if-cached'):
                    info['status'] = '504'
                    response = Response(info)
                    content = ""
                else:
                    (response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
        except Exception, e:
            if self.force_exception_to_status_code:
                if isinstance(e, HttpLib2ErrorWithResponse):
                    response = e.response
                    content = e.content
                    response.status = 500
                    response.reason = str(e)
                elif isinstance(e, socket.timeout):
                    content = "Request Timeout"
                    response = Response({
                        "content-type": "text/plain",
                        "status": "408",
                        "content-length": len(content)
                        })
                    response.reason = "Request Timeout"
                else:
                    content = str(e)
                    response = Response({
                        "content-type": "text/plain",
                        "status": "400",
                        "content-length": len(content)
                        })
                    response.reason = "Bad Request"
            else:
                raise


        return (response, content)


class Response(dict):
    """An object more like email.Message than httplib.HTTPResponse."""

    """Is this response from our local cache"""
    fromcache = False

    """HTTP protocol version used by server. 10 for HTTP/1.0, 11 for HTTP/1.1. """
    version = 11

    "Status code returned by server. "
    status = 200

    """Reason phrase returned by server."""
    reason = "Ok"

    previous = None

    def __init__(self, info):
        # info is either an email.Message or
        # an httplib.HTTPResponse object.
        if isinstance(info, httplib.HTTPResponse):
            for key, value in info.getheaders():
                self[key.lower()] = value
            self.status = info.status
            self['status'] = str(self.status)
            self.reason = info.reason
            self.version = info.version
        elif isinstance(info, email.Message.Message):
            for key, value in info.items():
                self[key] = value
            self.status = int(self['status'])
        else:
            for key, value in info.iteritems():
                self[key] = value
            self.status = int(self.get('status', self.status))

    def __getattr__(self, name):
        if name == 'dict':
            return self
        else:
            raise AttributeError, name
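
# Response subclasses dict: headers read like dictionary entries while
# status information hangs off attributes, e.g. (sketch):
#
#   (resp, content) = Http().request("http://example.org/")
#   resp.status           # -> 200
#   resp['content-type']  # e.g. 'text/html'
#   resp.fromcache        # True when served from the local cache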