blob: 3a38dc14c9047d9bd4c9274e8bb098a579beed58 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaranfd41e082010-04-17 14:44:14 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000011RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000012Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
David Malcolmee255682010-12-02 16:41:00 +000014RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000015
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000019RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000020McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000028"""
29
Serhiy Storchaka8ea46162013-03-14 21:31:37 +020030import re
Facundo Batista2ac5de22008-07-07 18:24:11 +000031import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000032import collections
Cheryl Sabella0250de42018-04-25 16:51:54 -070033import warnings
Facundo Batista2ac5de22008-07-07 18:24:11 +000034
# Names exported by a star import of this module.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000041
# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by “urlsplit” and “urlparse”.

# urljoin() only resolves a reference against its base when the scheme
# is listed here; otherwise it returns the reference unchanged.
uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']

# Consulted by urljoin() (to inherit the base netloc) and by urlunsplit()
# (to decide whether to emit '//' even when the netloc is empty).
uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']

# urlparse() only splits ';params' off the path for these schemes.
uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips']

uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero']

# Characters valid in scheme names
# (urlsplit() checks every character before the first ':' against this set)
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
79
# XXX: Consider replacing with functools.lru_cache
# When _parse_cache already holds MAX_CACHE_SIZE entries, urlsplit() calls
# clear_cache() before storing another result.
MAX_CACHE_SIZE = 20
# Keyed by (url, scheme, allow_fragments, type(url), type(scheme));
# values are the SplitResult objects built by urlsplit().
_parse_cache = {}
83
def clear_cache():
    """Clear the parse cache and the quoters cache.

    Empties both module-level dictionaries in place; urlsplit() also calls
    this when the parse cache reaches MAX_CACHE_SIZE entries.
    """
    _parse_cache.clear()
    _safe_quoters.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000088
89
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
# These two values are the default codec arguments of _encode_result()
# and _decode_args() below.
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
98
def _noop(obj):
    """Return *obj* unchanged (the result coercion used for str inputs)."""
    return obj
101
def _encode_result(obj, encoding=_implicit_encoding,
                        errors=_implicit_errors):
    """Return obj.encode(encoding, errors).

    _coerce_args() hands this back as the result coercion for non-str
    inputs, so that results computed on str are converted back.
    """
    return obj.encode(encoding, errors)
105
def _decode_args(args, encoding=_implicit_encoding,
                       errors=_implicit_errors):
    """Decode every item of *args* and return the results as a tuple.

    Falsy items (empty bytes, None, ...) are not decoded; they become
    the empty str instead.
    """
    decoded = []
    for item in args:
        if item:
            decoded.append(item.decode(encoding, errors))
        else:
            decoded.append('')
    return tuple(decoded)
109
def _coerce_args(*args):
    """Normalise the arguments to str and pick a matching result converter.

    Returns the (possibly decoded) arguments followed by one extra item:
    _noop when the first argument is a str, _encode_result otherwise.

    Raises TypeError when a non-empty argument disagrees with the first
    argument about being a str.
    """
    first_is_str = isinstance(args[0], str)
    # Falsy arguments are never checked, which is what lets callers keep
    # a "scheme=''" default while passing bytes for everything else.
    mixed = any(arg and isinstance(arg, str) != first_is_str
                for arg in args[1:])
    if mixed:
        raise TypeError("Cannot mix str and non-str arguments")
    if not first_is_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
125
126# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Encode each field and rebuild the result as its bytes counterpart.

        The target class is read from the ``_encoded_counterpart``
        attribute, which must be provided by the concrete class.
        """
        counterpart = self._encoded_counterpart
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return counterpart(*encoded_fields)
133
134
class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Decode each field and rebuild the result as its str counterpart.

        The target class is read from the ``_decoded_counterpart``
        attribute, which must be provided by the concrete class.
        """
        counterpart = self._decoded_counterpart
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return counterpart(*decoded_fields)
141
142
class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element"""
    __slots__ = ()

    # Subclasses supply two properties that this mixin reads:
    #   _userinfo -> (username, password)
    #   _hostinfo -> (hostname, port)   port is the raw text or None

    @property
    def username(self):
        """First item of ``_userinfo``."""
        return self._userinfo[0]

    @property
    def password(self):
        """Second item of ``_userinfo``."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when the host part is empty."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        # Scoped IPv6 address may have zone info, which must not be lowercased
        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
        separator = '%' if isinstance(hostname, str) else b'%'
        hostname, percent, zone = hostname.partition(separator)
        return hostname.lower() + percent + zone

    @property
    def port(self):
        """Port as an int, or None when no port was given.

        Raises ValueError when the port text is not made solely of ASCII
        digits or when the number is outside 0-65535.
        """
        port = self._hostinfo[1]
        if port is None:
            return None
        # int() alone is too lenient: it accepts a sign, surrounding
        # whitespace, '_' separators and non-ASCII digits, none of which
        # are valid in a URL port.
        if not (port.isascii() and port.isdigit()):
            message = f'Port could not be cast to integer value as {port!r}'
            raise ValueError(message)
        port = int(port, 10)
        if not (0 <= port <= 65535):
            raise ValueError("Port out of range 0-65535")
        return port
178
179
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from the text before the last '@'.

        Both are None without an '@'; password is None without a ':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        username, colon, password = credentials.partition(':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) taken from the text after the last '@'.

        A bracketed host is returned without its brackets; port is the raw
        text after ':' or None when that text is empty or missing.
        """
        hostinfo = self.netloc.rpartition('@')[2]
        if '[' in hostinfo:
            bracketed = hostinfo.partition('[')[2]
            hostname, _, trailer = bracketed.partition(']')
            port = trailer.partition(':')[2]
        else:
            hostname, _, port = hostinfo.partition(':')
        return hostname, (port or None)
208
209
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from the bytes before the last b'@'.

        Both are None without a b'@'; password is None without a b':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        username, colon, password = credentials.partition(b':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) taken from the bytes after the last b'@'.

        A bracketed host is returned without its brackets; port is the raw
        bytes after b':' or None when they are empty or missing.
        """
        hostinfo = self.netloc.rpartition(b'@')[2]
        if b'[' in hostinfo:
            bracketed = hostinfo.partition(b'[')[2]
            hostname, _, trailer = bracketed.partition(b']')
            port = trailer.partition(b':')[2]
        else:
            hostname, _, port = hostinfo.partition(b':')
        return hostname, (port or None)
238
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000239
240from collections import namedtuple
241
# Plain namedtuple bases for the result classes; the field lists are
# spelled out explicitly and the documentation is attached afterwards.
_DefragResultBase = namedtuple('DefragResult', ['url', 'fragment'])
_SplitResultBase = namedtuple(
    'SplitResult', ['scheme', 'netloc', 'path', 'query', 'fragment'])
_ParseResultBase = namedtuple(
    'ParseResult',
    ['scheme', 'netloc', 'path', 'params', 'query', 'fragment'])

# --- DefragResult documentation ---
_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

# --- SplitResult documentation ---
_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

# --- ParseResult documentation ---
_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

# Every field ParseResult shares with SplitResult reuses that field's text.
for _shared_field in _SplitResultBase._fields:
    getattr(_ParseResultBase, _shared_field).__doc__ = (
        getattr(_SplitResultBase, _shared_field).__doc__)
del _shared_field
307
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000308
# For backwards compatibility, alias _NetlocResultMixinStr.
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle.
ResultBase = _NetlocResultMixinStr
313
314# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    __slots__ = ()

    def geturl(self):
        """Return the URL with '#fragment' re-attached when there is one."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000322
# str flavour of the 5-field result; urlsplit() builds instances of it.
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        """Return the URL obtained by passing this result to urlunsplit()."""
        return urlunsplit(self)
327
# str flavour of the 6-field result; urlparse() builds instances of it.
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        """Return the URL obtained by passing this result to urlunparse()."""
        return urlunparse(self)
332
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000333# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    __slots__ = ()

    def geturl(self):
        """Return the URL with b'#fragment' re-attached when there is one."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
341
# bytes flavour of the 5-field result (paired with SplitResult).
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        """Return the URL obtained by passing this result to urlunsplit()."""
        return urlunsplit(self)
346
# bytes flavour of the 6-field result (paired with ParseResult).
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        """Return the URL obtained by passing this result to urlunparse()."""
        return urlunparse(self)
351
352# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Link every str result class with its bytes twin, in both directions.

    After this runs, the str class exposes its twin as
    ``_encoded_counterpart`` and the bytes class exposes the str class as
    ``_decoded_counterpart``; the encode()/decode() mixins rely on these.
    """
    str_to_bytes = {
        DefragResult: DefragResultBytes,
        SplitResult: SplitResultBytes,
        ParseResult: ParseResultBytes,
    }
    for str_cls, bytes_cls in str_to_bytes.items():
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# One-shot set-up: run it, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000365
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    # Parameters are only split off for schemes listed in uses_params.
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000381
def _splitparams(url):
    """Split ';params' off the last path segment of *url*.

    Returns (path, params).  Only a ';' located after the final '/' counts;
    params is everything after the first such ';'.  When there is none the
    input is returned unchanged together with an empty params string.
    """
    # rfind() yields -1 without a '/', so the search then starts at index 0.
    i = url.find(';', url.rfind('/') + 1)
    if i < 0:
        return url, ''
    return url[:i], url[i+1:]
390
def _splitnetloc(url, start=0):
    """Return (netloc, rest) for *url*, reading the netloc from *start*.

    The netloc ends at the earliest '/', '?' or '#' found at or after
    *start*, or at the end of the string when none of them occurs.
    """
    found = [pos for pos in (url.find(mark, start) for mark in '/?#')
             if pos >= 0]
    end = min(found, default=len(url))
    return url[start:end], url[end:]
398
def _checknetloc(netloc):
    """Reject a netloc whose NFKC form would introduce URL delimiters.

    Empty and pure-ASCII values are accepted immediately.  Otherwise the
    value is NFKC-normalised; if that changes it and the result contains
    any of '/?#@:', ValueError is raised.
    """
    if not netloc or netloc.isascii():
        return
    # looking for characters like \u2100 that expand to 'a/c'
    # IDNA uses NFKC equivalence, so normalize for this check
    import unicodedata
    # Delimiters already present are ignored; only the surrounding text
    # is examined.
    stripped = netloc
    for delimiter in '@:#?':
        stripped = stripped.replace(delimiter, '')
    normalized = unicodedata.normalize('NFKC', stripped)
    if normalized == stripped:
        return
    if any(delimiter in normalized for delimiter in '/?#@:'):
        raise ValueError("netloc '" + netloc + "' contains invalid " +
                         "characters under NFKC normalization")
416
def _split_after_scheme(url, allow_fragments):
    """Split the text that follows '<scheme>:' into its four components.

    Returns (netloc, path, query, fragment).  Raises ValueError for an
    unbalanced '[' / ']' in the netloc, or when _checknetloc() rejects it.
    """
    netloc = query = fragment = ''
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    return netloc, url, query, fragment


def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Results are memoised in _parse_cache.  Raises ValueError for an
    unbalanced IPv6 bracket in the netloc or a netloc that fails the
    NFKC check."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            # Unlike the generic case below, no port-number check is made.
            scheme, url = 'http', url[i+1:]
        elif all(c in scheme_chars for c in url[:i]):
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[i+1:]
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = url[:i].lower(), rest

    netloc, url, query, fragment = _split_after_scheme(url, allow_fragments)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000473
def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment, _coerce_result = (
        _coerce_args(*components))
    # Fold the params back into the path, then reuse the 5-part join.
    path = "%s;%s" % (url, params) if params else url
    joined = urlunsplit((scheme, netloc, path, query, fragment))
    return _coerce_result(joined)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000484
def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment, _coerce_result = (
        _coerce_args(*components))
    # '//' is emitted for a non-empty netloc, and also for a netloc-using
    # scheme whose path does not already begin with '//'.
    needs_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if needs_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000503
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    # A falsy operand short-circuits: the other one is returned as given.
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, _ = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    # Different scheme, or one without relative resolution: url wins as is.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    # No path and no params: inherit them (and a missing query) from base.
    if not (path or params):
        path, params = bpath, bparams
        query = query or bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_dirs = bpath.split('/')
    if base_dirs[-1]:
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        base_dirs.pop()

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_dirs + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = [seg for seg in segments[1:-1] if seg]

    resolved = []
    for seg in segments:
        if seg == '.':
            continue
        if seg != '..':
            resolved.append(seg)
        elif resolved:
            # a '..' with nothing left to remove is ignored (rfc3986)
            resolved.pop()

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved.append('')

    joined_path = '/'.join(resolved) or '/'
    return _coerce_result(urlunparse((scheme, netloc, joined_path,
                                      params, query, fragment)))
Antoine Pitrou55ac5b32014-08-21 19:16:17 -0400571
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000572
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        # Nothing to strip: hand the URL back untouched.
        return _coerce_result(DefragResult(url, ''))
    scheme, netloc, path, params, query, frag = urlparse(url)
    defrag = urlunparse((scheme, netloc, path, params, query, ''))
    return _coerce_result(DefragResult(defrag, frag))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000588
_hexdig = '0123456789ABCDEFabcdef'
# Two-hex-digit bytes -> the single byte they denote; built on first use.
_hextobyte = None

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escaped = string.split(b'%')
    if not escaped:
        # No '%' at all: nothing to decode.
        return string
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    global _hextobyte
    if _hextobyte is None:
        _hextobyte = {(hi + lo).encode(): bytes.fromhex(hi + lo)
                      for hi in _hexdig for lo in _hexdig}
    pieces = [head]
    for chunk in escaped:
        byte = _hextobyte.get(chunk[:2])
        if byte is None:
            # Not a valid %xx escape: keep the '%' and the text verbatim.
            pieces.append(b'%')
            pieces.append(chunk)
        else:
            pieces.append(byte)
            pieces.append(chunk[2:])
    return b''.join(pieces)
621
# Splits a string into alternating non-ASCII / ASCII runs (ASCII captured).
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    Passing None for encoding or errors selects the respective default,
    for bytes as well as str input.

    unquote('abc%20def') -> 'abc def'.
    """
    # Resolve None before any branch so bytes and str inputs agree.
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    if isinstance(string, bytes):
        return unquote_to_bytes(string).decode(encoding, errors)
    if '%' not in string:
        # Attribute access only: raises for objects that are not str-like.
        string.split
        return string
    # Only ASCII runs can hold escapes; non-ASCII runs are copied through.
    bits = _asciire.split(string)
    res = [bits[0]]
    append = res.append
    for i in range(1, len(bits), 2):
        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
        append(bits[i + 1])
    return ''.join(res)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000650
Senthil Kumaran257b9802017-04-04 21:19:43 -0700651
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, then throws a ValueError if there
            are more than n fields read by parse_qsl().

        Returns a dictionary.
    """
    # Group the (name, value) pairs by name, keeping values in input order.
    values_by_name = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors,
                                 max_num_fields=max_num_fields):
        values_by_name.setdefault(name, []).append(value)
    return values_by_name
Facundo Batistac469d4c2008-09-03 22:49:01 +0000689
Senthil Kumaran257b9802017-04-04 21:19:43 -0700690
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query string into a list of (name, value) pairs.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag controlling fields whose value is blank.
            When true, such fields are kept with '' as their value.
            When false (the default), they are dropped as if they had
            not been present.

        strict_parsing: flag controlling the reaction to malformed fields.
            When false (the default), they are silently skipped.
            When true, a ValueError is raised.

        encoding and errors: how percent-encoded sequences are decoded
            into Unicode characters, as accepted by bytes.decode().

        max_num_fields: int. If set, a ValueError is raised when the
            query holds more than this many fields.

        Returns a list of (name, value) tuples in query order.
    """
    qs, _coerce_result = _coerce_args(qs)

    # Reject oversized queries before splitting anything: counting the
    # separators is cheap and guards against memory-exhaustion attacks
    # through request bodies with a huge number of fields.
    if max_num_fields is not None:
        field_count = 1 + qs.count('&') + qs.count(';')
        if field_count > max_num_fields:
            raise ValueError('Max number of fields exceeded')

    def _decode(text):
        # '+' means space in form encoding; it must be replaced before
        # percent-decoding so that an escaped %2B survives as '+'.
        text = unquote(text.replace('+', ' '),
                       encoding=encoding, errors=errors)
        return _coerce_result(text)

    result = []
    # Both '&' and ';' act as field separators.
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not field and not strict_parsing:
                continue
            name, equals, value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control name without '=' only counts as a field with
                # a blank value when blank values are requested.
                if not keep_blank_values:
                    continue
            if value or keep_blank_values:
                result.append((_decode(name), _decode(value)))
    return result
750
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but plus signs are first turned into spaces, as
    needed when unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # The substitution happens before unquote() runs, so a literal plus
    # that was escaped as %2B is still returned as '+'.
    return unquote(string.replace('+', ' '), encoding, errors)
759
# Byte values that are never percent-encoded: ASCII letters, digits and
# the four punctuation characters "_", ".", "-" and "~".
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_.-~')
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Module-level cache of quoting callables (filled in outside this block).
_safe_quoters = {}


class Quoter(collections.defaultdict):
    """A mapping from byte values (ints in range(0, 256)) to strings.

    A key maps to its own character when it belongs to the safe set
    (the always-safe bytes plus the bytes given to the constructor) and
    to its '%XX' percent-encoding otherwise.
    """
    # The mapping doubles as its own cache: a value is computed in
    # __missing__ on first access and stored, so later lookups of the
    # same key are plain dictionary hits.
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Show the cached entries under this class's name instead of the
        # inherited defaultdict representation.
        return "<%s %r>" % (self.__class__.__name__, dict(self))

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, return it.
        if b in self.safe:
            res = chr(b)
        else:
            res = '%{:02X}'.format(b)
        self[b] = res
        return res
788
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Percent-encode a string for use inside a URL component.

    The various parts of a URL (the path info, the query, etc.) each
    reserve a different set of characters that must be quoted. quote()
    takes a cautious (not minimal) approach that suits most of them.

    RFC 3986 Uniform Resource Identifier (URI): Generic Syntax defines
    the following (un)reserved characters.

    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="

    A reserved character is reserved in some URL component, but not
    necessarily in every one of them.

    Every character that is neither unreserved ("always safe") nor
    listed in the safe argument is %-escaped.

    safe defaults to '/'. The slash is a reserved character, but quote()
    is typically applied to a path whose existing slashes should be kept.

    Python 3.7 moved from RFC 2396 to RFC 3986 for quoting URL strings,
    so "~" now belongs to the unreserved characters.

    string and safe may each be a str or a bytes object. encoding and
    errors must not be given when string is a bytes object; doing so
    raises TypeError.

    For a str, the optional encoding and errors parameters say how to
    handle non-ASCII characters, as accepted by str.encode(). The
    defaults are encoding='utf-8' and errors='strict' (unsupported
    characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Bytes input is never encoded, so codec options make no sense.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if not string:
        # Nothing to quote; hand the empty string straight back.
        return string

    encoded = string.encode(
        'utf-8' if encoding is None else encoding,
        'strict' if errors is None else errors,
    )
    return quote_from_bytes(encoded, safe)
842
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but spaces become '+', as required for quoting HTML
    form values. Plus signs already present in the input are escaped
    unless safe contains them. Unlike quote(), safe does not default
    to '/'.
    """
    # Work out whether the input (str or bytes) contains a space. Inputs
    # of any other type take the substitution path below.
    if isinstance(string, str):
        may_have_space = ' ' in string
    elif isinstance(string, bytes):
        may_have_space = b' ' in string
    else:
        may_have_space = True

    if not may_have_space:
        # Without spaces plain quote() already gives the right answer.
        return quote(string, safe, encoding, errors)

    # Let spaces pass through quote() untouched, then swap them for '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000859
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but takes a bytes (or bytearray) object instead of a
    str and performs no string-to-bytes encoding. The result is a str.
    quote_from_bytes(b'abc def?') -> 'abc%20def%3F'

    Raises TypeError when bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''

    # Normalize 'safe' to a bytes object holding only ASCII values.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)

    # Fast path: stripping every safe byte leaves nothing behind, so no
    # byte needs escaping.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()

    # Reuse one quoting callable per distinct 'safe' value.
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = Quoter(safe).__getitem__
        _safe_quoters[safe] = quoter
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000881
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    When doseq is true, a value that is a sequence is expanded into one
    'key=element' parameter per element.

    When query is a sequence of two-element tuples, the parameters appear
    in the output in the same order as in the input.

    Keys and values may each be a str or a bytes object; anything else
    is converted with str() before quoting.

    safe, encoding and errors are passed on to the function given as
    quote_via (encoding and errors only for non-bytes components).

    Raises TypeError when query is neither a mapping nor a sequence whose
    first item is a tuple.
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # Strings and string-like objects are sequences too, which makes
        # this check a little awkward.
        try:
            # len() fails for non-sequences; a non-empty string fails the
            # tuple test on its first item.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Empty sequences of any type pass. Empty dicts were always
            # accepted, so this is kept for consistency.
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    def _encode(item):
        # bytes are quoted as they are; everything else goes through
        # str() and is quoted with the requested encoding and errors.
        if isinstance(item, bytes):
            return quote_via(item, safe)
        return quote_via(str(item), safe, encoding, errors)

    expand_sequences = bool(doseq)
    parts = []
    for key, value in query:
        key = _encode(key)
        if not expand_sequences:
            parts.append(key + '=' + _encode(value))
        elif isinstance(value, bytes):
            parts.append(key + '=' + quote_via(value, safe))
        elif isinstance(value, str):
            parts.append(key + '=' + quote_via(value, safe, encoding, errors))
        else:
            try:
                # Having a length is what qualifies a value as a sequence.
                len(value)
            except TypeError:
                # Not a sequence: encode it as one scalar value.
                parts.append(key + '=' + _encode(value))
            else:
                # One parameter per element of the sequence.
                for element in value:
                    parts.append(key + '=' + _encode(element))
    return '&'.join(parts)
961
Cheryl Sabella0250de42018-04-25 16:51:54 -0700962
def to_bytes(url):
    """Deprecated alias of _to_bytes(); warns, then forwards the call."""
    message = "urllib.parse.to_bytes() is deprecated as of 3.8"
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _to_bytes(url)


def _to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str argument is returned as a str after checking that it consists
    of ASCII characters only; otherwise UnicodeError is raised. Any
    other argument is returned unchanged.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        # The encode/decode round trip only succeeds for pure ASCII text.
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
981
Cheryl Sabella0250de42018-04-25 16:51:54 -0700982
def unwrap(url):
    """Turn a string like '<URL:scheme://host/path>' into 'scheme://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace. An enclosing '<...>' pair and a leading 'URL:' prefix
    are then removed when present; other strings come back unchanged
    apart from the stripping.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
994
Cheryl Sabella0250de42018-04-25 16:51:54 -0700995
def splittype(url):
    """Deprecated alias of _splittype(); warns, then forwards the call."""
    message = ("urllib.parse.splittype() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splittype(url)


# Compiled lazily on the first call of _splittype().
_typeprog = None


def _splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case. When url does not start with a
    'type:' prefix free of '/' characters, the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = _typeprog.match(url)
    if found is None:
        return None, url
    return found.group(1).lower(), found.group(2)
1015
Cheryl Sabella0250de42018-04-25 16:51:54 -07001016
def splithost(url):
    """Deprecated alias of _splithost(); warns, then forwards the call."""
    message = ("urllib.parse.splithost() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splithost(url)


# Compiled lazily on the first call of _splithost().
_hostprog = None


def _splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '#' or '?'. A non-empty
    remainder that does not begin with '/' gets one prepended. When url
    does not start with '//', the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    found = _hostprog.match(url)
    if found is None:
        return None, url
    host_port, path = found.groups()
    if path and not path.startswith('/'):
        path = '/' + path
    return host_port, path
1038
Cheryl Sabella0250de42018-04-25 16:51:54 -07001039
def splituser(host):
    """Deprecated alias of _splituser(); warns, then forwards the call."""
    message = ("urllib.parse.splituser() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splituser(host)


def _splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'. Without any '@' the user part
    is None.
    """
    user, at_sign, hostport = host.rpartition('@')
    if not at_sign:
        user = None
    return user, hostport
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001051
Cheryl Sabella0250de42018-04-25 16:51:54 -07001052
def splitpasswd(user):
    """Deprecated alias of _splitpasswd(); warns, then forwards the call."""
    message = ("urllib.parse.splitpasswd() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitpasswd(user)


def _splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'. Without any ':' the password
    part is None.
    """
    name, colon, passwd = user.partition(':')
    if not colon:
        passwd = None
    return name, passwd
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001064
Cheryl Sabella0250de42018-04-25 16:51:54 -07001065
def splitport(host):
    """Deprecated alias of _splitport(); warns, then forwards the call."""
    message = ("urllib.parse.splitport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitport(host)


# Compiled lazily on the first call of _splitport().
_portprog = None


def _splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits. It is None when host has
    no trailing ':digits' part; a bare trailing ':' is dropped from the
    returned host.
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)$', re.DOTALL)

    found = _portprog.match(host)
    if found is None:
        return host, None
    name, port = found.groups()
    # An empty digit group (input ending in ':') counts as "no port".
    return name, (port or None)
1087
Cheryl Sabella0250de42018-04-25 16:51:54 -07001088
def splitnport(host, defport=-1):
    """Deprecated alias of _splitnport(); warns, then forwards the call."""
    warnings.warn("urllib.parse.splitnport() is deprecated as of 3.8, "
                  "use urllib.parse.urlparse() instead",
                  DeprecationWarning, stacklevel=2)
    return _splitnport(host, defport)


def _splitnport(host, defport=-1):
    """Split host and port, returning a numeric port.

    Return the given default port if no ':' is found or nothing follows
    the last ':'; the default is -1.
    Return the port as an int if ASCII decimal digits follow the ':'.
    Return None if something other than ASCII decimal digits follows
    the ':'.
    """
    host, delim, port = host.rpartition(':')
    if not delim:
        # No ':' at all: rpartition() left the whole input in 'port'.
        host = port
    elif port:
        nport = None
        # int() alone is too permissive for a port: it also accepts
        # signs, surrounding whitespace, '_' separators and non-ASCII
        # digits. Only plain ASCII digits form a valid port number.
        if port.isascii() and port.isdigit():
            try:
                nport = int(port)
            except ValueError:
                # Kept so that a conversion failure (for instance an
                # interpreter-imposed digit limit) still yields None.
                nport = None
        return host, nport
    return host, defport
1111
Cheryl Sabella0250de42018-04-25 16:51:54 -07001112
def splitquery(url):
    """Deprecated alias of _splitquery(); warns, then forwards the call."""
    message = ("urllib.parse.splitquery() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitquery(url)


def _splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'. Without any '?' the result is
    (url, None).
    """
    head, question_mark, query = url.rpartition('?')
    return (head, query) if question_mark else (url, None)
1126
Cheryl Sabella0250de42018-04-25 16:51:54 -07001127
def splittag(url):
    """Deprecated alias of _splittag(); warns, then forwards the call."""
    message = ("urllib.parse.splittag() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splittag(url)


def _splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'. Without any '#' the result is
    (url, None).
    """
    head, hash_mark, tag = url.rpartition('#')
    return (head, tag) if hash_mark else (url, None)
1141
Cheryl Sabella0250de42018-04-25 16:51:54 -07001142
def splitattr(url):
    """Deprecated alias of _splitattr(); warns, then forwards the call."""
    message = ("urllib.parse.splitattr() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitattr(url)


def _splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
1155
Cheryl Sabella0250de42018-04-25 16:51:54 -07001156
def splitvalue(attr):
    """Deprecated alias of _splitvalue(); warns, then forwards the call."""
    message = ("urllib.parse.splitvalue() is deprecated as of 3.8, "
               "use urllib.parse.parse_qsl() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitvalue(attr)


def _splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='. Without any '=' the value part
    is None.
    """
    name, equals, value = attr.partition('=')
    if not equals:
        value = None
    return name, value