Fix Issue5468 - urlencode to handle bytes and other alternate encodings.
(Extensive tests provided). Patch by Dan Mahn.
diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst
index 3d13efc..add07a5 100644
--- a/Doc/library/urllib.parse.rst
+++ b/Doc/library/urllib.parse.rst
@@ -310,23 +310,29 @@
``b'a&\xef'``.
-.. function:: urlencode(query, doseq=False)
+.. function:: urlencode(query, doseq=False, safe='', encoding=None, errors=None)
- Convert a mapping object or a sequence of two-element tuples to a
- "url-encoded" string, suitable to pass to :func:`urlopen` above as the
- optional *data* argument. This is useful to pass a dictionary of form
- fields to a ``POST`` request. The resulting string is a series of
- ``key=value`` pairs separated by ``'&'`` characters, where both *key* and
- *value* are quoted using :func:`quote_plus` above. When a sequence of
- two-element tuples is used as the *query* argument, the first element of
- each tuple is a key and the second is a value. The value element in itself
- can be a sequence and in that case, if the optional parameter *doseq* is
- evaluates to *True*, individual ``key=value`` pairs separated by ``'&'`` are
- generated for each element of the value sequence for the key. The order of
- parameters in the encoded string will match the order of parameter tuples in
- the sequence. This module provides the functions :func:`parse_qs` and
- :func:`parse_qsl` which are used to parse query strings into Python data
- structures.
+ Convert a mapping object or a sequence of two-element tuples, which may
+ contain :class:`str` or :class:`bytes` objects, to a "url-encoded" string,
+ suitable to pass to :func:`urlopen` above as the optional *data* argument.
+ This is useful to pass a dictionary of form fields to a ``POST`` request.
+ The resulting string is a series of ``key=value`` pairs separated by ``'&'``
+ characters, where both *key* and *value* are quoted using :func:`quote_plus`
+ above. When a sequence of two-element tuples is used as the *query*
+ argument, the first element of each tuple is a key and the second is a
+ value. The value element in itself can be a sequence and in that case, if
+ the optional parameter *doseq* evaluates to *True*, individual
+ ``key=value`` pairs separated by ``'&'`` are generated for each element of
+ the value sequence for the key. The order of parameters in the encoded
+ string will match the order of parameter tuples in the sequence. This module
+ provides the functions :func:`parse_qs` and :func:`parse_qsl` which are used
+ to parse query strings into Python data structures.
+
+ When a key or value in *query* is a :class:`str`, the *safe*, *encoding* and
+ *errors* parameters are passed to :func:`quote_plus` for encoding.
+
+ .. versionchanged:: 3.2
+ The *query* parameter supports bytes and string objects.
.. seealso::
diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py
index 597678d..e293cf0 100644
--- a/Lib/test/test_urllib.py
+++ b/Lib/test/test_urllib.py
@@ -795,6 +795,116 @@
self.assertEqual("a=a&a=b",
urllib.parse.urlencode({"a": {"a": 1, "b": 1}}, True))
+ def test_urlencode_encoding(self):
+ # str keys and values are quoted using the requested encoding/errors.
+ # ASCII encoding. Expect %3F (a quoted '?') with errors="replace"
+ given = (('\u00a0', '\u00c1'),)
+ expect = '%3F=%3F'
+ result = urllib.parse.urlencode(given, encoding="ASCII", errors="replace")
+ self.assertEqual(expect, result)
+
+ # Default is UTF-8 encoding.
+ given = (('\u00a0', '\u00c1'),)
+ expect = '%C2%A0=%C3%81'
+ result = urllib.parse.urlencode(given)
+ self.assertEqual(expect, result)
+
+ # Latin-1 encoding.
+ given = (('\u00a0', '\u00c1'),)
+ expect = '%A0=%C1'
+ result = urllib.parse.urlencode(given, encoding="latin-1")
+ self.assertEqual(expect, result)
+
+ def test_urlencode_encoding_doseq(self):
+ # Same encodings as test_urlencode_encoding, but through the doseq=True
+ # code path, with both scalar values and sequences of values.
+ # ASCII Encoding. Expect %3F (a quoted '?') with errors="replace"
+ given = (('\u00a0', '\u00c1'),)
+ expect = '%3F=%3F'
+ result = urllib.parse.urlencode(given, doseq=True,
+ encoding="ASCII", errors="replace")
+ self.assertEqual(expect, result)
+
+ # ASCII Encoding. On a sequence of values.
+ given = (("\u00a0", (1, "\u00c1")),)
+ expect = '%3F=1&%3F=%3F'
+ result = urllib.parse.urlencode(given, True,
+ encoding="ASCII", errors="replace")
+ self.assertEqual(expect, result)
+
+ # Utf-8
+ given = (("\u00a0", "\u00c1"),)
+ expect = '%C2%A0=%C3%81'
+ result = urllib.parse.urlencode(given, True)
+ self.assertEqual(expect, result)
+
+ # Utf-8. On a sequence of values.
+ given = (("\u00a0", (42, "\u00c1")),)
+ expect = '%C2%A0=42&%C2%A0=%C3%81'
+ result = urllib.parse.urlencode(given, True)
+ self.assertEqual(expect, result)
+
+ # latin-1
+ given = (("\u00a0", "\u00c1"),)
+ expect = '%A0=%C1'
+ result = urllib.parse.urlencode(given, True, encoding="latin-1")
+ self.assertEqual(expect, result)
+
+ # latin-1. On a sequence of values.
+ given = (("\u00a0", (42, "\u00c1")),)
+ expect = '%A0=42&%A0=%C1'
+ result = urllib.parse.urlencode(given, True, encoding="latin-1")
+ self.assertEqual(expect, result)
+
+ def test_urlencode_bytes(self):
+ # bytes keys and values are quoted byte-for-byte; '$' (\x24) becomes
+ # %24 here because no safe characters are given.
+ given = ((b'\xa0\x24', b'\xc1\x24'),)
+ expect = '%A0%24=%C1%24'
+ result = urllib.parse.urlencode(given)
+ self.assertEqual(expect, result)
+ # The doseq=True path must give the same result for a scalar bytes value.
+ result = urllib.parse.urlencode(given, True)
+ self.assertEqual(expect, result)
+
+ # Sequence of values
+ given = ((b'\xa0\x24', (42, b'\xc1\x24')),)
+ expect = '%A0%24=42&%A0%24=%C1%24'
+ result = urllib.parse.urlencode(given, True)
+ self.assertEqual(expect, result)
+
+ def test_urlencode_encoding_safe_parameter(self):
+
+ # Send '$' (\x24) as safe character, so it must be left unquoted
+ # in both keys and values.
+ # Default utf-8 encoding
+
+ given = ((b'\xa0\x24', b'\xc1\x24'),)
+ result = urllib.parse.urlencode(given, safe=":$")
+ expect = '%A0$=%C1$'
+ self.assertEqual(expect, result)
+
+ given = ((b'\xa0\x24', b'\xc1\x24'),)
+ result = urllib.parse.urlencode(given, doseq=True, safe=":$")
+ expect = '%A0$=%C1$'
+ self.assertEqual(expect, result)
+
+ # Safe parameter in sequence
+ given = ((b'\xa0\x24', (b'\xc1\x24', 0xd, 42)),)
+ expect = '%A0$=%C1$&%A0$=13&%A0$=42'
+ result = urllib.parse.urlencode(given, True, safe=":$")
+ self.assertEqual(expect, result)
+
+ # Test all above in latin-1 encoding
+
+ given = ((b'\xa0\x24', b'\xc1\x24'),)
+ result = urllib.parse.urlencode(given, safe=":$",
+ encoding="latin-1")
+ expect = '%A0$=%C1$'
+ self.assertEqual(expect, result)
+
+ given = ((b'\xa0\x24', b'\xc1\x24'),)
+ expect = '%A0$=%C1$'
+ result = urllib.parse.urlencode(given, doseq=True, safe=":$",
+ encoding="latin-1")
+ # This case previously computed result without checking it.
+ self.assertEqual(expect, result)
+
+ given = ((b'\xa0\x24', (b'\xc1\x24', 0xd, 42)),)
+ expect = '%A0$=%C1$&%A0$=13&%A0$=42'
+ result = urllib.parse.urlencode(given, True, safe=":$",
+ encoding="latin-1")
+ self.assertEqual(expect, result)
+
class Pathname_Tests(unittest.TestCase):
"""Test pathname2url() and url2pathname()"""
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index ffb0ff7..c6ebcc9 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -559,7 +559,7 @@
_safe_quoters[safe] = quoter = Quoter(safe).__getitem__
return ''.join([quoter(char) for char in bs])
-def urlencode(query, doseq=False):
+def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
"""Encode a sequence of two-element tuples or dictionary into a URL query string.
If any values in the query arg are sequences and doseq is true, each
@@ -568,6 +568,10 @@
If the query arg is a sequence of two-element tuples, the order of the
parameters in the output will match the order of parameters in the
input.
+
+ The components of the query arg may each be either a string or a bytes
+ type. When a component is a string, the safe, encoding and errors
+ parameters are passed to quote_plus for encoding.
"""
if hasattr(query, "items"):
@@ -592,14 +596,28 @@
l = []
if not doseq:
for k, v in query:
- k = quote_plus(str(k))
- v = quote_plus(str(v))
+ if isinstance(k, bytes):
+ k = quote_plus(k, safe)
+ else:
+ k = quote_plus(str(k), safe, encoding, errors)
+
+ if isinstance(v, bytes):
+ v = quote_plus(v, safe)
+ else:
+ v = quote_plus(str(v), safe, encoding, errors)
l.append(k + '=' + v)
else:
for k, v in query:
- k = quote_plus(str(k))
- if isinstance(v, str):
- v = quote_plus(v)
+ if isinstance(k, bytes):
+ k = quote_plus(k, safe)
+ else:
+ k = quote_plus(str(k), safe, encoding, errors)
+
+ if isinstance(v, bytes):
+ v = quote_plus(v, safe)
+ l.append(k + '=' + v)
+ elif isinstance(v, str):
+ v = quote_plus(v, safe, encoding, errors)
l.append(k + '=' + v)
else:
try:
@@ -607,12 +625,16 @@
x = len(v)
except TypeError:
# not a sequence
- v = quote_plus(str(v))
+ v = quote_plus(str(v), safe, encoding, errors)
l.append(k + '=' + v)
else:
# loop over the sequence
for elt in v:
- l.append(k + '=' + quote_plus(str(elt)))
+ if isinstance(elt, bytes):
+ elt = quote_plus(elt, safe)
+ else:
+ elt = quote_plus(str(elt), safe, encoding, errors)
+ l.append(k + '=' + elt)
return '&'.join(l)
# Utilities to parse URLs (most of these return None for missing parts):
diff --git a/Misc/NEWS b/Misc/NEWS
index 4b6d7d7..3b2ff32 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -468,6 +468,9 @@
Library
-------
+- Issue #5468: urlencode to handle bytes type and other encodings in its query
+ parameter. Patch by Dan Mahn.
+
- Issue #7673: Fix security vulnerability (CVE-2010-2089) in the audioop
module, ensure that the input string length is a multiple of the frame size