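Issue #1285086: Get rid of the refcounting hack (building the result with
repeated `s +=` string concatenation) and speed up urllib.unquote() and its
copy urlparse.unquote() by collecting the pieces in a list and joining them.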
diff --git a/Lib/urllib.py b/Lib/urllib.py
index 33641a5..f9655f9 100644
--- a/Lib/urllib.py
+++ b/Lib/urllib.py
@@ -28,6 +28,7 @@
import time
import sys
import base64
+import re
from urlparse import urljoin as basejoin
@@ -1198,22 +1199,35 @@
_hexdig = '0123456789ABCDEFabcdef'
_hextochr = dict((a + b, chr(int(a + b, 16)))
for a in _hexdig for b in _hexdig)
+_asciire = re.compile('([\x00-\x7f]+)')
def unquote(s):
"""unquote('abc%20def') -> 'abc def'."""
- res = s.split('%')
+ if _is_unicode(s):
+ if '%' not in s:
+ return s
+ bits = _asciire.split(s)
+ res = [bits[0]]
+ append = res.append
+ for i in range(1, len(bits), 2):
+ append(unquote(str(bits[i])).decode('latin1'))
+ append(bits[i + 1])
+ return ''.join(res)
+
+ bits = s.split('%')
# fastpath
- if len(res) == 1:
+ if len(bits) == 1:
return s
- s = res[0]
- for item in res[1:]:
+ res = [bits[0]]
+ append = res.append
+ for item in bits[1:]:
try:
- s += _hextochr[item[:2]] + item[2:]
+ append(_hextochr[item[:2]])
+ append(item[2:])
except KeyError:
- s += '%' + item
- except UnicodeDecodeError:
- s += unichr(int(item[:2], 16)) + item[2:]
- return s
+ append('%')
+ append(item)
+ return ''.join(res)
def unquote_plus(s):
"""unquote('%7e/abc+def') -> '~/abc def'"""
diff --git a/Lib/urlparse.py b/Lib/urlparse.py
index f370ce3..4ce982e 100644
--- a/Lib/urlparse.py
+++ b/Lib/urlparse.py
@@ -28,6 +28,8 @@
"""
+import re
+
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
"urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
@@ -311,6 +313,15 @@
else:
return url, ''
+try:
+ unicode
+except NameError:
+ def _is_unicode(x):
+ return 0
+else:
+ def _is_unicode(x):
+ return isinstance(x, unicode)
+
# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin). If you update this function,
@@ -319,22 +330,35 @@
_hexdig = '0123456789ABCDEFabcdef'
_hextochr = dict((a+b, chr(int(a+b,16)))
for a in _hexdig for b in _hexdig)
+_asciire = re.compile('([\x00-\x7f]+)')
def unquote(s):
"""unquote('abc%20def') -> 'abc def'."""
- res = s.split('%')
+ if _is_unicode(s):
+ if '%' not in s:
+ return s
+ bits = _asciire.split(s)
+ res = [bits[0]]
+ append = res.append
+ for i in range(1, len(bits), 2):
+ append(unquote(str(bits[i])).decode('latin1'))
+ append(bits[i + 1])
+ return ''.join(res)
+
+ bits = s.split('%')
# fastpath
- if len(res) == 1:
+ if len(bits) == 1:
return s
- s = res[0]
- for item in res[1:]:
+ res = [bits[0]]
+ append = res.append
+ for item in bits[1:]:
try:
- s += _hextochr[item[:2]] + item[2:]
+ append(_hextochr[item[:2]])
+ append(item[2:])
except KeyError:
- s += '%' + item
- except UnicodeDecodeError:
- s += unichr(int(item[:2], 16)) + item[2:]
- return s
+ append('%')
+ append(item)
+ return ''.join(res)
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
"""Parse a query given as a string argument.
diff --git a/Misc/NEWS b/Misc/NEWS
index ae0402b..10c3bd0 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -214,6 +214,8 @@
Library
-------
+- Issue #1285086: Get rid of the refcounting hack and speed up urllib.unquote().
+
- Issue #17368: Fix an off-by-one error in the Python JSON decoder that caused
a failure while decoding empty object literals when object_pairs_hook was
specified.