Fixed bug 1461941

commit: fd22e434835b9c2cfddcba21e1a8722a1e6c2b9a [log] [tgz]
author: jcgregorio <devnull@localhost> Thu Apr 27 02:00:08 2006 +0000
committer: jcgregorio <devnull@localhost> Thu Apr 27 02:00:08 2006 +0000
tree: 5184460ac7927a2e604ac826574c2ae2b533e981
parent: 0bf729291e612a39952eddde507871a7009ddf7c [diff]
diff --git a/httplib2/__init__.py b/httplib2/__init__.py
index 8f58486..1dd2029 100644
--- a/httplib2/__init__.py
+++ b/httplib2/__init__.py

@@ -106,8 +106,9 @@
     groups = URI.match(uri).groups()
     return (groups[1], groups[3], groups[4], groups[6], groups[8])
 
+NORMALIZE_SPACE = re.compile(r'(?:\r\n)?[ \t]+')
 def _normalize_headers(headers):
-    return dict([ (key.lower(), value)  for (key, value) in headers.iteritems()])
+    return dict([ (key.lower(), NORMALIZE_SPACE.sub(' ', value).strip())  for (key, value) in headers.iteritems()])  # sub(repl, string): collapse each whitespace run (incl. folded CRLF) in value to one space
 
 def _parse_cache_control(headers):
     retval = {}
@@ -118,15 +119,27 @@
         retval = dict(parts_with_args + parts_wo_args)
     return retval 
 
-WWW_AUTH = re.compile(r"^(?:,?\s*([a-zA-Z0-9_-]+)\s*=\s*\"((?:[^\\\"]|\\.)*?)\")(.*)$")
-# Yes, some parameters don't have quotes. Why again am I spending so much time doing HTTP?
-WWW_AUTH2 = re.compile(r"^(?:,?\s*([a-zA-Z0-9_-]+)\s*=\s*(\w+))(.*)$")
+# Whether to use a strict mode to parse WWW-Authenticate headers
+# Might lead to bad results in case of ill-formed header value,
+# so disabled by default, falling back to relaxed parsing.
+# Set to true to turn on; useful for testing servers.
+USE_WWW_AUTH_STRICT_PARSING = 0
+
+# In regex below:
+#    [^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+             matches a "token" as defined by HTTP
+#    "(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?"    matches a "quoted-string" as defined by HTTP, when LWS have already been replaced by a single space
+# Actually, as an auth-param value can be either a token or a quoted-string, they are combined in a single pattern which matches both:
+#    \"?((?<=\")(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?
+WWW_AUTH_STRICT = re.compile(r"^(?:\s*(?:,\s*)?([^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+)\s*=\s*\"?((?<=\")(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?)(.*)$")
+WWW_AUTH_RELAXED = re.compile(r"^(?:\s*(?:,\s*)?([^ \t\r\n=]+)\s*=\s*\"?((?<=\")(?:[^\\\"]|\\.)*?(?=\")|(?<!\")[^ \t\r\n,]+(?!\"))\"?)(.*)$")
+UNQUOTE_PAIRS = re.compile(r'\\(.)')
 def _parse_www_authenticate(headers, headername='www-authenticate'):
     """Returns a dictionary of dictionaries, one dict
     per auth_scheme."""
     retval = {}
     if headers.has_key(headername):
         authenticate = headers[headername].strip()
+        www_auth = USE_WWW_AUTH_STRICT_PARSING and WWW_AUTH_STRICT or WWW_AUTH_RELAXED
         while authenticate:
             # Break off the scheme at the beginning of the line
             if headername == 'authentication-info':
@@ -135,18 +148,13 @@
                 (auth_scheme, the_rest) = authenticate.split(" ", 1)
             # Now loop over all the key value pairs that come after the scheme, 
             # being careful not to roll into the next scheme
-            match = WWW_AUTH.search(the_rest)
-            match2 = WWW_AUTH2.search(the_rest)
+            match = www_auth.search(the_rest)
             auth_params = {}
-            while match or match2:
-                if match2 and len(match2.groups()) == 3:
-                    (key, value, the_rest) = match2.groups()
-                    auth_params[key.lower()] = value
-                elif match and len(match.groups()) == 3:
+            while match:
+                if match and len(match.groups()) == 3:
                     (key, value, the_rest) = match.groups()
-                    auth_params[key.lower()] = value
-                match = WWW_AUTH.search(the_rest)
-                match2 = WWW_AUTH2.search(the_rest)
+                    auth_params[key.lower()] = UNQUOTE_PAIRS.sub(r'\1', value) # '\\'.join([x.replace('\\', '') for x in value.split('\\\\')])
+                match = www_auth.search(the_rest)
             retval[auth_scheme.lower()] = auth_params
             authenticate = the_rest.strip()
     return retval

diff --git a/httplib2test.py b/httplib2test.py
index 20820be..9d04eab 100755
--- a/httplib2test.py
+++ b/httplib2test.py

@@ -701,6 +701,27 @@
         res = httplib2._parse_www_authenticate({})
         self.assertEqual(len(res.keys()), 0) 
 
+    def testParseWWWAuthenticate(self):
+        # Whitespace before and/or after the commas separating auth-params must not matter: all five params are found
+        res = httplib2._parse_www_authenticate({ 'www-authenticate': 'Test realm="test realm" , foo=foo ,bar="bar", baz=baz,qux=qux'})
+        self.assertEqual(len(res.keys()), 1)
+        self.assertEqual(len(res['test'].keys()), 5)
+        
+        # Scheme, parameter name and unquoted value may contain non-alphanumeric token characters
+        res = httplib2._parse_www_authenticate({ 'www-authenticate': 'T*!%#st realm=to*!%#en, to*!%#en="quoted string"'})
+        self.assertEqual(len(res.keys()), 1)
+        self.assertEqual(len(res['t*!%#st'].keys()), 2)
+        
+        # Quoted-string containing quoted-pairs: the escaping backslashes are removed from the parsed value
+        res = httplib2._parse_www_authenticate({ 'www-authenticate': 'Test realm="a \\"test\\" realm"'})
+        self.assertEqual(len(res.keys()), 1)
+        self.assertEqual(res['test']['realm'], 'a "test" realm')
+
+    def testParseWWWAuthenticateStrict(self):
+        httplib2.USE_WWW_AUTH_STRICT_PARSING = 1  # re-run the same cases against the strict regex
+        try: self.testParseWWWAuthenticate()
+        finally: httplib2.USE_WWW_AUTH_STRICT_PARSING = 0  # always restore the relaxed default, even if the cases fail
+
     def testParseWWWAuthenticateBasic(self):
         res = httplib2._parse_www_authenticate({ 'www-authenticate': 'Basic realm="me"'})
         basic = res['basic']
commit	fd22e434835b9c2cfddcba21e1a8722a1e6c2b9a	[log] [tgz]
author	jcgregorio <devnull@localhost>	Thu Apr 27 02:00:08 2006 +0000
committer	jcgregorio <devnull@localhost>	Thu Apr 27 02:00:08 2006 +0000
tree	5184460ac7927a2e604ac826574c2ae2b533e981
parent	0bf729291e612a39952eddde507871a7009ddf7c [diff]