blob: 5bd067895bfa3d79e8accad94fbdb0e35c519d88 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaranfd41e082010-04-17 14:44:14 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732 : "Format for Literal IPv6 Addresses in URL's" by R.Hinden, B.Carpenter
and L.Masinter, December 1999.
10
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000011RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000012Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
David Malcolmee255682010-12-02 16:41:00 +000014RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000015
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000019RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000020McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000028"""
29
Serhiy Storchaka8ea46162013-03-14 21:31:37 +020030import re
Facundo Batista2ac5de22008-07-07 18:24:11 +000031import sys
Batuhan Taşkaya03615562020-04-10 17:46:36 +030032import types
Guido van Rossum52dbbb92008-08-18 21:44:30 +000033import collections
Cheryl Sabella0250de42018-04-25 16:51:54 -070034import warnings
Facundo Batista2ac5de22008-07-07 18:24:11 +000035
# Public names exported by a star-import of this module.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000042
# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by “urlsplit” and “urlparse”.

# Schemes for which urljoin() resolves a relative reference against the base.
uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']

# Schemes for which urlunsplit() emits '//' even when the netloc is empty,
# and for which urljoin() inherits the netloc of the base URL.
uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']

# Schemes for which urlparse() splits ';params' off the last path segment.
uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips']

uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero']

# Characters valid in scheme names
# (urlsplit() only accepts a scheme made up entirely of these characters)
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
80
# XXX: Consider replacing with functools.lru_cache
# Once _parse_cache holds this many entries, urlsplit() calls clear_cache()
# before storing another result.
MAX_CACHE_SIZE = 20
# Maps the urlsplit() argument key to the SplitResult computed for it.
_parse_cache = {}
84
def clear_cache():
    """Clear the parse cache and the quoters cache."""
    # Both caches are plain module-level dicts; empty them in place so that
    # existing references to the dict objects stay valid.
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000089
90
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
# These two values are the default codec arguments of _encode_result()
# and _decode_args().
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
99
100def _noop(obj):
101 return obj
102
def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Encode *obj* via its own encode() method (result coercion for bytes inputs)."""
    encoded = obj.encode(encoding, errors)
    return encoded
106
def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Return a tuple with every item of *args* decoded to str.

    Falsy items (empty bytes, '' or None) become the empty string without
    being decoded.
    """
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)
110
def _coerce_args(*args):
    """Coerce the arguments to str and return them with a result converter.

    The returned tuple holds the (possibly decoded) arguments followed by a
    function to apply to the final result:
      - _noop when the first argument is a str
      - _encode_result otherwise, after decoding every argument

    Raises TypeError when a non-empty later argument does not agree with the
    first one about being a str.
    """
    first_is_str = isinstance(args[0], str)
    # Empty values are exempt from the check to support the
    # "scheme=''" default argument to some functions
    mixed = any(arg and isinstance(arg, str) != first_is_str
                for arg in args[1:])
    if mixed:
        raise TypeError("Cannot mix str and non-str arguments")
    if not first_is_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
126
127# Result objects are more helpful than simple tuples
128class _ResultMixinStr(object):
129 """Standard approach to encoding parsed results from str to bytes"""
130 __slots__ = ()
131
132 def encode(self, encoding='ascii', errors='strict'):
133 return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))
134
135
136class _ResultMixinBytes(object):
137 """Standard approach to decoding parsed results from bytes to str"""
138 __slots__ = ()
139
140 def decode(self, encoding='ascii', errors='strict'):
141 return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))
142
143
144class _NetlocResultMixinBase(object):
145 """Shared methods for the parsed result objects containing a netloc element"""
146 __slots__ = ()
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000147
148 @property
149 def username(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000150 return self._userinfo[0]
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000151
152 @property
153 def password(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000154 return self._userinfo[1]
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000155
156 @property
157 def hostname(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000158 hostname = self._hostinfo[0]
159 if not hostname:
Коренберг Маркfbd60512017-12-21 17:16:17 +0500160 return None
161 # Scoped IPv6 address may have zone info, which must not be lowercased
162 # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
163 separator = '%' if isinstance(hostname, str) else b'%'
164 hostname, percent, zone = hostname.partition(separator)
165 return hostname.lower() + percent + zone
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000166
167 @property
168 def port(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000169 port = self._hostinfo[1]
170 if port is not None:
Matt Eaton2cb46612018-03-20 01:41:37 -0500171 try:
172 port = int(port, 10)
173 except ValueError:
174 message = f'Port could not be cast to integer value as {port!r}'
175 raise ValueError(message) from None
Senthil Kumaran2fc5a502012-05-24 21:56:17 +0800176 if not ( 0 <= port <= 65535):
Robert Collinsdfa95c92015-08-10 09:53:30 +1200177 raise ValueError("Port out of range 0-65535")
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000178 return port
179
Batuhan Taşkaya03615562020-04-10 17:46:36 +0300180 __class_getitem__ = classmethod(types.GenericAlias)
181
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000182
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """Netloc component accessors for str-based results."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) from the netloc; a missing part is None."""
        # Split at the LAST '@' so that the userinfo may itself contain '@'.
        userinfo, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        username, colon, password = userinfo.partition(':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) from the netloc; an empty or absent port is None."""
        hostinfo = self.netloc.rpartition('@')[2]
        _, open_bracket, bracketed = hostinfo.partition('[')
        if open_bracket:
            # Bracketed (IPv6 literal) host: the port follows the ']'.
            hostname, _, tail = bracketed.partition(']')
            port = tail.partition(':')[2]
        else:
            hostname, _, port = hostinfo.partition(':')
        return hostname, (port or None)
211
212
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """Netloc component accessors for bytes-based results."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) from the netloc; a missing part is None."""
        # Split at the LAST b'@' so that the userinfo may itself contain b'@'.
        userinfo, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        username, colon, password = userinfo.partition(b':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) from the netloc; an empty or absent port is None."""
        hostinfo = self.netloc.rpartition(b'@')[2]
        _, open_bracket, bracketed = hostinfo.partition(b'[')
        if open_bracket:
            # Bracketed (IPv6 literal) host: the port follows the b']'.
            hostname, _, tail = bracketed.partition(b']')
            port = tail.partition(b':')[2]
        else:
            hostname, _, port = hostinfo.partition(b':')
        return hostname, (port or None)
241
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000242
243from collections import namedtuple
244
# Plain namedtuple bases.  The public result classes combine one of these
# with a str- or bytes-specific mixin.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

# Fields shared with SplitResult reuse its field documentation.
_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__


# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr
316
317# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """str result of urldefrag(): the URL without fragment, and the fragment."""
    __slots__ = ()

    def geturl(self):
        """Return the URL, with '#fragment' re-attached when it is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000325
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """str 5-tuple result of urlsplit(), with netloc accessors mixed in."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunsplit()."""
        return urlunsplit(self)
330
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """str 6-tuple result of urlparse(), with netloc accessors mixed in."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunparse()."""
        return urlunparse(self)
335
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000336# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """bytes counterpart of DefragResult."""
    __slots__ = ()

    def geturl(self):
        """Return the URL, with b'#fragment' re-attached when it is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
344
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """bytes counterpart of SplitResult."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL via urlunsplit()."""
        return urlunsplit(self)
349
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """bytes counterpart of ParseResult."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL via urlunparse()."""
        return urlunparse(self)
354
355# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Link each str result class with its bytes counterpart, both ways.

    The links are what the encode()/decode() mixin methods use to find the
    class to instantiate.
    """
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000368
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    The result is a named 6-tuple with fields corresponding to the
    above. It is either a ParseResult or ParseResultBytes object,
    depending on the type of the url parameter.

    The username, password, hostname, and port sub-components of netloc
    can also be accessed as attributes of the returned object.

    The scheme argument provides the default value of the scheme
    component when no scheme is found in url.

    If allow_fragments is False, no attempt is made to separate the
    fragment component from the previous component, which can be either
    path or query.

    Note that % escapes are not expanded.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    # Only schemes listed in uses_params get ';params' split off the path.
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return _coerce_result(
        ParseResult(scheme, netloc, path, params, query, fragment))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000398
399def _splitparams(url):
400 if '/' in url:
401 i = url.find(';', url.rfind('/'))
402 if i < 0:
403 return url, ''
404 else:
405 i = url.find(';')
406 return url[:i], url[i+1:]
407
408def _splitnetloc(url, start=0):
409 delim = len(url) # position of end of domain part of url, default is end
410 for c in '/?#': # look for delimiters; the order is NOT important
411 wdelim = url.find(c, start) # find first of this delim
412 if wdelim >= 0: # if found
413 delim = min(delim, wdelim) # use earliest delim position
414 return url[start:delim], url[delim:] # return (domain, rest)
415
Steve Dower16e6f7d2019-03-07 08:02:26 -0800416def _checknetloc(netloc):
417 if not netloc or netloc.isascii():
418 return
419 # looking for characters like \u2100 that expand to 'a/c'
420 # IDNA uses NFKC equivalence, so normalize for this check
421 import unicodedata
Steve Dower8d0ef0b2019-06-04 08:55:30 -0700422 n = netloc.replace('@', '') # ignore characters already included
423 n = n.replace(':', '') # but not the surrounding text
424 n = n.replace('#', '')
Steve Dowerd537ab02019-04-30 12:03:02 +0000425 n = n.replace('?', '')
426 netloc2 = unicodedata.normalize('NFKC', n)
427 if n == netloc2:
Steve Dower16e6f7d2019-03-07 08:02:26 -0800428 return
Steve Dower16e6f7d2019-03-07 08:02:26 -0800429 for c in '/?#@:':
430 if c in netloc2:
Steve Dowerd537ab02019-04-30 12:03:02 +0000431 raise ValueError("netloc '" + netloc + "' contains invalid " +
Steve Dower16e6f7d2019-03-07 08:02:26 -0800432 "characters under NFKC normalization")
433
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    The result is a named 5-tuple with fields corresponding to the
    above. It is either a SplitResult or SplitResultBytes object,
    depending on the type of the url parameter.

    The username, password, hostname, and port sub-components of netloc
    can also be accessed as attributes of the returned object.

    The scheme argument provides the default value of the scheme
    component when no scheme is found in url.

    If allow_fragments is False, no attempt is made to separate the
    fragment component from the previous component, which can be either
    path or query.

    Note that % escapes are not expanded.

    Raises ValueError if the netloc contains an unmatched '[' or ']', or if
    it is rejected by _checknetloc().
    """

    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    # Results are memoized, keyed on the coerced arguments.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        # The text before the first ':' is a scheme only if every character
        # is a valid scheme character; otherwise the default scheme is kept.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # A bracket without its partner cannot be a valid IPv6 literal.
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    # The fragment is removed first, so a '?' inside it is not taken for the
    # start of the query.
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000485
def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    *parts, _coerce_result = _coerce_args(*components)
    scheme, netloc, url, params, query, fragment = parts
    # Fold the params back into the path, then defer to urlunsplit().
    if params:
        url = "%s;%s" % (url, params)
    rebuilt = urlunsplit((scheme, netloc, url, query, fragment))
    return _coerce_result(rebuilt)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000496
def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The components argument can be any five-item
    iterable. This may result in a slightly different, but equivalent URL,
    if the URL that was parsed originally had unnecessary delimiters (for
    example, a ? with an empty query; the RFC states that these are
    equivalent)."""
    *parts, _coerce_result = _coerce_args(*components)
    scheme, netloc, url, query, fragment = parts
    # '//' is emitted for a non-empty netloc, and also for netloc-using
    # schemes unless the path already starts with '//'.
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url += '?' + query
    if fragment:
        url += '#' + fragment
    return _coerce_result(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000515
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty, the other one is returned unchanged.
    allow_fragments is passed through to urlparse() for both URLs.
    """
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default scheme for the reference.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)

    # A different scheme, or one that does not support relative references,
    # means the reference is returned as it is.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            # The reference carries its own netloc: only the scheme is
            # inherited from the base.
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not path and not params:
        # Empty path: reuse the base path and params, and the base query
        # too unless the reference supplies one.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        del base_parts[-1]

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = filter(None, segments[1:-1])

    resolved_path = []

    for seg in segments:
        if seg == '..':
            try:
                resolved_path.pop()
            except IndexError:
                # ignore any .. segments that would otherwise cause an IndexError
                # when popped from resolved_path if resolving for rfc3986
                pass
        elif seg == '.':
            continue
        else:
            resolved_path.append(seg)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    # An empty resolved path collapses to the root path '/'.
    return _coerce_result(urlunparse((scheme, netloc, '/'.join(
        resolved_path) or '/', params, query, fragment)))
Antoine Pitrou55ac5b32014-08-21 19:16:17 -0400583
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000584
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        # Nothing to strip: the URL is returned exactly as given.
        return _coerce_result(DefragResult(url, ''))
    parsed = urlparse(url)
    # Rebuild the URL from the first five components, with an empty fragment.
    defrag = urlunparse(tuple(parsed[:5]) + ('',))
    return _coerce_result(DefragResult(defrag, parsed[5]))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000600
_hexdig = '0123456789ABCDEFabcdef'
# Lookup table from two hex digits (as bytes) to the byte they denote;
# built on the first unquote_to_bytes() call that needs it.
_hextobyte = None

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escaped = string.split(b'%')
    if not escaped:
        # No '%' at all: nothing to unquote.
        return string
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    global _hextobyte
    if _hextobyte is None:
        _hextobyte = {(hi + lo).encode(): bytes.fromhex(hi + lo)
                      for hi in _hexdig for lo in _hexdig}
    chunks = [head]
    for item in escaped:
        try:
            byte = _hextobyte[item[:2]]
        except KeyError:
            # Not a valid %xx escape: keep the text as it was.
            chunks.append(b'%')
            chunks.append(item)
        else:
            chunks.append(byte)
            chunks.append(item[2:])
    return b''.join(chunks)
633
# Matches maximal runs of ASCII characters.  The capturing group makes
# split() return those runs between the non-ASCII pieces.
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.  Passing None for
    encoding or errors selects the respective default, for both str and
    bytes input.

    unquote('abc%20def') -> 'abc def'.
    """
    # Resolve None before branching on the input type, so that bytes input
    # gets the same defaults as str input instead of failing in decode().
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    if isinstance(string, bytes):
        return unquote_to_bytes(string).decode(encoding, errors)
    if '%' not in string:
        # Is it a string-like object?
        string.split
        return string
    # bits alternates: non-ASCII text, ASCII run, non-ASCII text, ...
    # Only the ASCII runs can contain %xx escapes; the rest is kept as is.
    bits = _asciire.split(string)
    res = [bits[0]]
    for ascii_run, other in zip(bits[1::2], bits[2::2]):
        res.append(unquote_to_bytes(ascii_run).decode(encoding, errors))
        res.append(other)
    return ''.join(res)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000662
Senthil Kumaran257b9802017-04-04 21:19:43 -0700663
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, then throws a ValueError if there
            are more than n fields read by parse_qsl().

        separator: str. The symbol to use for separating the query arguments.
            Defaults to &.

        Returns a dictionary.
    """
    # Group the (name, value) pairs by name, keeping the values of a repeated
    # name in the order in which they appear.
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors,
                                 max_num_fields=max_num_fields,
                                 separator=separator):
        grouped.setdefault(name, []).append(value)
    return grouped
Facundo Batistac469d4c2008-09-03 22:49:01 +0000704
Senthil Kumaran257b9802017-04-04 21:19:43 -0700705
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
    """Parse a query string into a list of (name, value) pairs.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as blank
            strings.  The default false value indicates that blank values
            are to be ignored and treated as if they were not included.

        strict_parsing: flag indicating what to do with parsing errors. If
            false (the default), errors are silently ignored. If true,
            errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, then throws a ValueError
            if there are more than n fields read by parse_qsl().

        separator: str or bytes. The symbol to use for separating the query
            arguments.  Defaults to &.  A bytes separator is decoded as ASCII.

        Raises ValueError if separator is empty or not str/bytes, if
        max_num_fields is exceeded, or (with strict_parsing) on a field
        without an '=' sign.

        Returns a list of (name, value) tuples, in order of appearance.
    """
    qs, _coerce_result = _coerce_args(qs)

    if not separator or not isinstance(separator, (str, bytes)):
        raise ValueError("Separator must be of type string or bytes.")
    if isinstance(separator, bytes):
        # qs is handled as str from here on (it is split on the str '='
        # below), so a bytes separator has to be decoded before it can be
        # handed to str.count() / str.split().
        separator = separator.decode('ascii')

    # If max_num_fields is defined then check that the number of fields
    # does not exceed max_num_fields. This prevents a memory exhaustion DOS
    # attack via post bodies with many fields.
    if max_num_fields is not None:
        num_fields = 1 + qs.count(separator)
        if max_num_fields < num_fields:
            raise ValueError('Max number of fields exceeded')

    r = []
    for name_value in qs.split(separator):
        # Empty fields (e.g. from 'a=1&&b=2') are skipped unless strict.
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = nv[0].replace('+', ' ')
            name = unquote(name, encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = nv[1].replace('+', ' ')
            value = unquote(value, encoding=encoding, errors=errors)
            value = _coerce_result(value)
            r.append((name, value))
    return r
772
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Decode like unquote(), after first turning every '+' into a space.

    This is the decoding needed for HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # Plus signs are replaced before percent-decoding, so an encoded
    # '%2B' still comes out as a literal '+'.
    spaced = string.replace('+', ' ')
    return unquote(spaced, encoding, errors)
781
# Byte values that are never percent-encoded: ASCII letters, digits and
# the four punctuation characters '_', '.', '-' and '~'.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-~')
# The same byte values as a bytes object (quote_from_bytes() passes it to
# bytes.rstrip() for its "nothing to quote" fast path).
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache filled by quote_from_bytes(): normalized 'safe' bytes -> the bound
# __getitem__ of the Quoter built for that safe set.
_safe_quoters = {}
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000788
class Quoter(collections.defaultdict):
    """Lazily-filled table from byte values (in range(0,256)) to strings.

    A byte maps to its own character when it belongs to the safe set
    (the caller-supplied bytes plus the always-safe defaults); every
    other byte maps to its '%XX' percent-encoded form.
    """
    # The defaultdict base gives us a cache for free: once a key has been
    # stored, looking it up again never re-enters Python code.
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Show the class name plus the cached entries instead of the
        # default defaultdict representation.
        cached = dict(self)
        return "<%s %r>" % (self.__class__.__name__, cached)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
810
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Percent-encode a string for use in a URL component.

    Every part of a URL (path, query, ...) has its own set of reserved
    characters that must be quoted.  This function is a cautious, not
    minimal, quoting routine that is suitable for most of those parts.

    RFC 3986 Uniform Resource Identifier (URI): Generic Syntax lists
    the following (un)reserved characters.

    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="

    Each reserved character is reserved in some component of a URL, but
    not necessarily in all of them.

    Everything that is neither an unreserved ("always safe") character
    nor listed in the safe argument is %-escaped.

    safe defaults to '/'.  The slash is reserved, but quote() is typically
    applied to a path whose existing slashes must be kept.

    Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings.
    Now, "~" is included in the set of unreserved characters.

    string and safe may be either str or bytes objects.  encoding and errors
    must not be specified if string is a bytes object (a TypeError is
    raised otherwise).

    For str input, encoding and errors say how non-ASCII characters are
    converted, as accepted by the str.encode method.  The defaults are
    encoding='utf-8' and errors='strict' (unsupported characters raise a
    UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Non-str input is passed on as-is; codec arguments make no sense.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if not string:
        # An empty str is returned unchanged.
        return string
    codec = 'utf-8' if encoding is None else encoding
    policy = 'strict' if errors is None else errors
    return quote_from_bytes(string.encode(codec, policy), safe)
864
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. Unlike quote(), safe does not default to '/'.
    """
    # string may be str or bytes; when it holds no space at all, plain
    # quote() already produces the right answer.
    if isinstance(string, str):
        space_free = ' ' not in string
    elif isinstance(string, bytes):
        space_free = b' ' not in string
    else:
        space_free = False
    if space_free:
        return quote(string, safe, encoding, errors)

    # Let spaces survive quoting by marking them safe (using the same type
    # as safe), then turn them into plus signs.
    extra = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + extra, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000881
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'

    Raises TypeError if bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''

    # Normalize 'safe' to a bytes object without non-ASCII values.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)

    # Fast path: stripping every safe byte leaves nothing, so there is
    # nothing to escape.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()

    # One Quoter per distinct safe set, shared through the module cache.
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000903
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.
    Components of any other type are converted with str() before quoting.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).

    Raises TypeError if query is neither a mapping nor a sequence whose
    first element is a tuple.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    def _quote(item):
        # bytes are quoted as they are (no encoding/errors); anything else
        # is converted with str() first.
        if isinstance(item, bytes):
            return quote_via(item, safe)
        return quote_via(str(item), safe, encoding, errors)

    parts = []
    for k, v in query:
        k = _quote(k)
        if not doseq or isinstance(v, bytes):
            parts.append(k + '=' + _quote(v))
        elif isinstance(v, str):
            # A str is a sequence too, but must be emitted as one value.
            # It is handed over without a str() round-trip.
            parts.append(k + '=' + quote_via(v, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                parts.append(k + '=' + _quote(v))
            else:
                # one key=value parameter per element of the sequence
                for elt in v:
                    parts.append(k + '=' + _quote(elt))
    return '&'.join(parts)
983
Cheryl Sabella0250de42018-04-25 16:51:54 -0700984
def to_bytes(url):
    """Deprecated public alias of _to_bytes(); emits a DeprecationWarning."""
    message = "urllib.parse.to_bytes() is deprecated as of 3.8"
    # stacklevel=2 attributes the warning to our caller, not to this wrapper.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _to_bytes(url)
989
990
def _to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Despite the name, a str is returned: a str argument is checked to be
    pure ASCII (UnicodeError is raised otherwise) and handed back as str.
    Arguments that are not str are returned untouched.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        # The encode step is the actual ASCII check.
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL %r contains non-ASCII characters" % (url,))
1003
Cheryl Sabella0250de42018-04-25 16:51:54 -07001004
def unwrap(url):
    """Transform a string like '<URL:scheme://host/path>' into 'scheme://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; an enclosing '<...>' pair and a leading 'URL:' marker are
    then each removed if present.  A string that is not wrapped comes back
    unchanged apart from that stripping.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[len('URL:'):].strip()
    return text
1016
Cheryl Sabella0250de42018-04-25 16:51:54 -07001017
def splittype(url):
    """Deprecated public alias of _splittype(); emits a DeprecationWarning."""
    message = ("urllib.parse.splittype() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to our caller, not to this wrapper.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splittype(url)
1023
1024
# Compiled lazily on the first _splittype() call.
_typeprog = None
def _splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned lower-cased.  When url does not start with a
    'type:' prefix (one or more characters other than '/' and ':' followed
    by a colon), (None, url) is returned.
    """
    global _typeprog
    prog = _typeprog
    if prog is None:
        prog = _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = prog.match(url)
    if found is None:
        return None, url
    scheme, rest = found.groups()
    return scheme.lower(), rest
1037
Cheryl Sabella0250de42018-04-25 16:51:54 -07001038
def splithost(url):
    """Deprecated public alias of _splithost(); emits a DeprecationWarning."""
    message = ("urllib.parse.splithost() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to our caller, not to this wrapper.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splithost(url)
1044
1045
# Compiled lazily on the first _splithost() call.
_hostprog = None
def _splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '#' or '?'.  A non-empty
    remainder that does not begin with '/' gets one prepended.  When url
    does not start with '//', (None, url) is returned.
    """
    global _hostprog
    prog = _hostprog
    if prog is None:
        prog = _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    found = prog.match(url)
    if found is None:
        return None, url
    host_port, path = found.groups()
    if path and not path.startswith('/'):
        path = '/' + path
    return host_port, path
1060
Cheryl Sabella0250de42018-04-25 16:51:54 -07001061
def splituser(host):
    """Deprecated public alias of _splituser(); emits a DeprecationWarning."""
    message = ("urllib.parse.splituser() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to our caller, not to this wrapper.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splituser(host)
1067
1068
def _splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Without any '@' the user part is
    None and the whole input is returned as the host part.
    """
    userinfo, at_sign, hostport = host.rpartition('@')
    if not at_sign:
        return None, hostport
    return userinfo, hostport
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001073
Cheryl Sabella0250de42018-04-25 16:51:54 -07001074
def splitpasswd(user):
    """Deprecated public alias of _splitpasswd(); emits a DeprecationWarning."""
    message = ("urllib.parse.splitpasswd() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to our caller, not to this wrapper.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitpasswd(user)
1080
1081
def _splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without any ':' the password
    part is None.
    """
    name, colon, secret = user.partition(':')
    if not colon:
        return name, None
    return name, secret
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001086
Cheryl Sabella0250de42018-04-25 16:51:54 -07001087
def splitport(host):
    """Deprecated public alias of _splitport(); emits a DeprecationWarning."""
    message = ("urllib.parse.splitport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to our caller, not to this wrapper.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitport(host)
1093
1094
# splitport('host:port') --> 'host', 'port'
# The pattern is compiled lazily on the first _splitport() call.
_portprog = None
def _splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of ASCII digits.  It is None when
    host has no ':' or when the text after the last ':' is not made of
    digits only; for a trailing ':' with nothing after it, the colon is
    dropped from the returned host and the port is None.
    """
    global _portprog
    prog = _portprog
    if prog is None:
        prog = _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

    found = prog.fullmatch(host)
    if found is not None:
        # host is rebound even when the port turns out to be empty, which
        # is what strips a dangling ':'.
        host, port = found.group(1, 2)
        if port:
            return host, port
    return host, None
1109
Cheryl Sabella0250de42018-04-25 16:51:54 -07001110
def splitnport(host, defport=-1):
    """Deprecated public alias of _splitnport(); emits a DeprecationWarning."""
    message = ("urllib.parse.splitnport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to our caller, not to this wrapper.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitnport(host, defport)
1116
1117
def _splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    The split happens at the last ':'.
    Return given default port if no ':' is found, or if nothing follows
    the ':'; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' is followed by anything that is not a plain
    sequence of ASCII digits."""
    host, delim, port = host.rpartition(':')
    if not delim:
        # No colon at all: rpartition() put the whole input in 'port'.
        host = port
    elif port:
        # int() alone is too lenient here: it also accepts a sign,
        # surrounding whitespace, underscores and non-ASCII digits, none
        # of which form a valid port.
        if port.isascii() and port.isdigit():
            nport = int(port)
        else:
            nport = None
        return host, nport
    return host, defport
1133
Cheryl Sabella0250de42018-04-25 16:51:54 -07001134
def splitquery(url):
    """Deprecated public alias of _splitquery(); emits a DeprecationWarning."""
    message = ("urllib.parse.splitquery() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to our caller, not to this wrapper.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitquery(url)
1140
1141
def _splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without any '?' the result is
    (url, None).
    """
    head, mark, query = url.rpartition('?')
    if not mark:
        return url, None
    return head, query
1148
Cheryl Sabella0250de42018-04-25 16:51:54 -07001149
def splittag(url):
    """Deprecated public alias of _splittag(); emits a DeprecationWarning."""
    message = ("urllib.parse.splittag() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to our caller, not to this wrapper.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splittag(url)
1155
1156
def _splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without any '#' the result is
    (url, None).
    """
    head, mark, tag = url.rpartition('#')
    if not mark:
        return url, None
    return head, tag
1163
Cheryl Sabella0250de42018-04-25 16:51:54 -07001164
def splitattr(url):
    """Deprecated public alias of _splitattr(); emits a DeprecationWarning."""
    message = ("urllib.parse.splitattr() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to our caller, not to this wrapper.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitattr(url)
1170
1171
def _splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
1177
Cheryl Sabella0250de42018-04-25 16:51:54 -07001178
def splitvalue(attr):
    """Deprecated public alias of _splitvalue(); emits a DeprecationWarning."""
    message = ("urllib.parse.splitvalue() is deprecated as of 3.8, "
               "use urllib.parse.parse_qsl() instead")
    # stacklevel=2 attributes the warning to our caller, not to this wrapper.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitvalue(attr)
1184
1185
def _splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without any '=' the value part
    is None.
    """
    name, equals, value = attr.partition('=')
    if not equals:
        return name, None
    return name, value