Merged revisions 76719,81270-81272,83294,83319,84038-84039 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/branches/py3k
................
r76719 | antoine.pitrou | 2009-12-08 20:38:17 +0100 (Tue., 08 Dec. 2009) | 9 lines
Merged revisions 76718 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk
........
r76718 | antoine.pitrou | 2009-12-08 20:35:12 +0100 (Tue., 08 Dec. 2009) | 3 lines
Fix transient refleaks in test_urllib. Thanks to Florent Xicluna.
........
................
r81270 | florent.xicluna | 2010-05-17 19:24:07 +0200 (Mon., 17 May 2010) | 9 lines
Merged revision 81259 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk
........
r81259 | florent.xicluna | 2010-05-17 12:39:07 +0200 (Mon., 17 May 2010) | 2 lines
Slight style cleanup.
........
................
r81271 | florent.xicluna | 2010-05-17 19:33:07 +0200 (Mon., 17 May 2010) | 11 lines
Issue #1285086: Speed up urllib.parse functions: quote, quote_from_bytes, unquote, unquote_to_bytes.
Recorded merge of revisions 81265 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk
........
r81265 | florent.xicluna | 2010-05-17 15:35:09 +0200 (Mon., 17 May 2010) | 2 lines
Issue #1285086: Speed up urllib.quote and urllib.unquote for simple cases.
........
................
r81272 | florent.xicluna | 2010-05-17 20:01:22 +0200 (Mon., 17 May 2010) | 2 lines
Restore the part of the comment that was inadvertently removed in r81271.
................
r83294 | senthil.kumaran | 2010-07-30 21:34:36 +0200 (Fri., 30 Jul. 2010) | 2 lines
Fix issue #9301 - handle the unquote({}) kind of case.
................
r83319 | florent.xicluna | 2010-07-31 10:56:55 +0200 (Sat., 31 Jul. 2010) | 2 lines
Fix an oversight in r83294. unquote() should reject bytes. Issue #9301.
................
r84038 | florent.xicluna | 2010-08-14 20:30:35 +0200 (Sat., 14 Aug. 2010) | 1 line
Silence the BytesWarning introduced by patch r83294 for #9301.
................
r84039 | florent.xicluna | 2010-08-14 22:51:58 +0200 (Sat., 14 Aug. 2010) | 1 line
Silence the BytesWarning while testing the exception.
................
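
For orientation, a minimal sketch (not part of the patch) exercising the four
functions touched by these revisions. The expected values follow the function
docstrings in urllib.parse (three of which appear in the diff below); the last
check reflects the intent of r83294/r83319 that unquote() reject non-string
input, and the exact exception type is an assumption from reading the patched
code, not something stated in the log.

    from urllib.parse import (quote, unquote,
                              quote_from_bytes, unquote_to_bytes)

    # Round trips, as documented in the docstrings.
    assert quote('abc def') == 'abc%20def'
    assert unquote('abc%20def') == 'abc def'
    assert quote_from_bytes(b'abc def\xab') == 'abc%20def%AB'
    assert unquote_to_bytes('abc%20def') == b'abc def'

    # Issue #9301: unquote() should reject bytes instead of mangling them.
    try:
        unquote(b'abc%20def')
    except TypeError:
        pass  # rejected, as intended by r83319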
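
r81265/r81271 claim a speedup of quote() and unquote() for simple inputs
(Issue #1285086). A rough, illustrative way to time those fast paths is
sketched below; it is not the benchmark used in the issue, and absolute
numbers depend on the interpreter and machine.

    import timeit

    setup = 'from urllib.parse import quote, unquote'
    for stmt in ("quote('abcdefghij' * 10)",      # nothing needs escaping
                 "unquote('abcdefghij' * 10)"):   # no '%' in the input
        t = timeit.timeit(stmt, setup=setup, number=100000)
        print('%-30s %.3f s per 100000 calls' % (stmt, t))
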
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 886c51c..765f1c8 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -39,7 +39,7 @@
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
'imap', 'wais', 'file', 'mms', 'https', 'shttp',
'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
- 'svn', 'svn+ssh', 'sftp', 'nfs',' git', 'git+ssh']
+ 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
@@ -61,8 +61,9 @@
_parse_cache = {}
def clear_cache():
- """Clear the parse cache."""
+ """Clear the parse cache and the quoters cache."""
_parse_cache.clear()
+ _safe_quoters.clear()
class ResultMixin(object):
@@ -302,17 +303,22 @@
"""unquote_to_bytes('abc%20def') -> b'abc def'."""
# Note: strings are encoded as UTF-8. This is only an issue if it contains
# unescaped non-ASCII characters, which URIs should not.
+ if not string:
+ # Is it a string-like object?
+ string.split
+ return b''
if isinstance(string, str):
string = string.encode('utf-8')
res = string.split(b'%')
- res[0] = res[0]
- for i in range(1, len(res)):
- item = res[i]
+ if len(res) == 1:
+ return string
+ string = res[0]
+ for item in res[1:]:
try:
- res[i] = bytes([int(item[:2], 16)]) + item[2:]
+ string += bytes([int(item[:2], 16)]) + item[2:]
except ValueError:
- res[i] = b'%' + item
- return b''.join(res)
+ string += b'%' + item
+ return string
def unquote(string, encoding='utf-8', errors='replace'):
"""Replace %xx escapes by their single-character equivalent. The optional
@@ -324,36 +330,39 @@
unquote('abc%20def') -> 'abc def'.
"""
- if encoding is None: encoding = 'utf-8'
- if errors is None: errors = 'replace'
- # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
- # (list of single-byte bytes objects)
- pct_sequence = []
+ if string == '':
+ return string
res = string.split('%')
- for i in range(1, len(res)):
- item = res[i]
+ if len(res) == 1:
+ return string
+ if encoding is None:
+ encoding = 'utf-8'
+ if errors is None:
+ errors = 'replace'
+ # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
+ pct_sequence = b''
+ string = res[0]
+ for item in res[1:]:
try:
- if not item: raise ValueError
- pct_sequence.append(bytes.fromhex(item[:2]))
+ if not item:
+ raise ValueError
+ pct_sequence += bytes.fromhex(item[:2])
rest = item[2:]
+ if not rest:
+ # This segment was just a single percent-encoded character.
+ # May be part of a sequence of code units, so delay decoding.
+ # (Stored in pct_sequence).
+ continue
except ValueError:
rest = '%' + item
- if not rest:
- # This segment was just a single percent-encoded character.
- # May be part of a sequence of code units, so delay decoding.
- # (Stored in pct_sequence).
- res[i] = ''
- else:
- # Encountered non-percent-encoded characters. Flush the current
- # pct_sequence.
- res[i] = b''.join(pct_sequence).decode(encoding, errors) + rest
- pct_sequence = []
+ # Encountered non-percent-encoded characters. Flush the current
+ # pct_sequence.
+ string += pct_sequence.decode(encoding, errors) + rest
+ pct_sequence = b''
if pct_sequence:
# Flush the final pct_sequence
- # res[-1] will always be empty if pct_sequence != []
- assert not res[-1], "string=%r, res=%r" % (string, res)
- res[-1] = b''.join(pct_sequence).decode(encoding, errors)
- return ''.join(res)
+ string += pct_sequence.decode(encoding, errors)
+ return string
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
"""Parse a query given as a string argument.
@@ -434,7 +443,8 @@
b'abcdefghijklmnopqrstuvwxyz'
b'0123456789'
b'_.-')
-_safe_quoters= {}
+_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
+_safe_quoters = {}
class Quoter(collections.defaultdict):
"""A mapping from bytes (in range(0,256)) to strings.
@@ -446,7 +456,7 @@
# of cached keys don't call Python code at all).
def __init__(self, safe):
"""safe: bytes object."""
- self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128)
+ self.safe = _ALWAYS_SAFE.union(safe)
def __repr__(self):
# Without this, will just display as a defaultdict
@@ -454,7 +464,7 @@
def __missing__(self, b):
# Handle a cache miss. Store quoted string in cache and return.
- res = b in self.safe and chr(b) or ('%%%02X' % b)
+ res = chr(b) if b in self.safe else '%{:02X}'.format(b)
self[b] = res
return res
@@ -488,6 +498,8 @@
errors='strict' (unsupported characters raise a UnicodeEncodeError).
"""
if isinstance(string, str):
+ if not string:
+ return string
if encoding is None:
encoding = 'utf-8'
if errors is None:
@@ -522,18 +534,22 @@
not perform string-to-bytes encoding. It always returns an ASCII string.
quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
"""
+ if not isinstance(bs, (bytes, bytearray)):
+ raise TypeError("quote_from_bytes() expected bytes")
+ if not bs:
+ return ''
if isinstance(safe, str):
# Normalize 'safe' by converting to bytes and removing non-ASCII chars
safe = safe.encode('ascii', 'ignore')
- cachekey = bytes(safe) # In case it was a bytearray
- if not (isinstance(bs, bytes) or isinstance(bs, bytearray)):
- raise TypeError("quote_from_bytes() expected a bytes")
+ else:
+ safe = bytes([c for c in safe if c < 128])
+ if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
+ return bs.decode()
try:
- quoter = _safe_quoters[cachekey]
+ quoter = _safe_quoters[safe]
except KeyError:
- quoter = Quoter(safe)
- _safe_quoters[cachekey] = quoter
- return ''.join([quoter[char] for char in bs])
+ _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
+ return ''.join([quoter(char) for char in bs])
def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
"""Encode a sequence of two-element tuples or dictionary into a URL query string.