blob: 4249163f0edde7c82bcbe2dd9b19c4231e007c4b [file] [log] [blame]
"""Parse (absolute and relative) URLs.

The urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to the
urlparse module should conform with it. The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
"""
29
Serhiy Storchaka8ea46162013-03-14 21:31:37 +020030import re
Facundo Batista2ac5de22008-07-07 18:24:11 +000031import sys
Batuhan Taşkaya03615562020-04-10 17:46:36 +030032import types
Guido van Rossum52dbbb92008-08-18 21:44:30 +000033import collections
Cheryl Sabella0250de42018-04-25 16:51:54 -070034import warnings
Facundo Batista2ac5de22008-07-07 18:24:11 +000035
# Public names exported by a star-import of this module.
__all__ = [
    # splitting and joining URLs
    "urlparse",
    "urlunparse",
    "urljoin",
    "urldefrag",
    "urlsplit",
    "urlunsplit",
    # query strings
    "urlencode",
    "parse_qs",
    "parse_qsl",
    # percent-encoding and decoding
    "quote",
    "quote_plus",
    "quote_from_bytes",
    "unquote",
    "unquote_plus",
    "unquote_to_bytes",
    # structured result types (str flavour, then bytes flavour)
    "DefragResult",
    "ParseResult",
    "SplitResult",
    "DefragResultBytes",
    "ParseResultBytes",
    "SplitResultBytes",
]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000042
# Scheme classification tables.
# The empty string stands for "no scheme given", which is the default
# scheme value returned by urlsplit() and urlparse().

# Schemes for which urljoin() resolves relative references.
uses_relative = [
    '', 'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', 'sftp', 'svn', 'svn+ssh',
    'ws', 'wss',
]

# Schemes that carry a //netloc part.
uses_netloc = [
    '', 'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
    'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh', 'ws', 'wss',
]

# Schemes whose last path segment may carry ;params.
uses_params = [
    '', 'ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms', 'sftp', 'tel',
]

# The three lists below are not used by this module any more.  They are
# undocumented but have public-looking names, so they are kept for
# backwards compatibility.

non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap', 'snews',
    'sip', 'sips',
]

uses_query = [
    '', 'http', 'wais', 'imap', 'https', 'shttp', 'mms', 'gopher', 'rtsp',
    'rtspu', 'sip', 'sips',
]

uses_fragment = [
    '', 'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', 'https',
    'shttp', 'snews', 'file', 'prospero',
]

# Every character that may appear in a scheme name.
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789'
    '+-.'
)
80
# Characters the WHATWG URL spec says must be removed from a URL
# before it is parsed: tab, carriage return and line feed.
_UNSAFE_URL_BYTES_TO_REMOVE = [
    '\t',
    '\r',
    '\n',
]

# Hand-rolled cache of urlsplit() results; it is cleared once it holds
# MAX_CACHE_SIZE entries.
# XXX: Consider replacing with functools.lru_cache
MAX_CACHE_SIZE = 20
_parse_cache = {}
87
def clear_cache():
    """Empty the parse cache and the quoters cache."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000092
93
# Helpers for bytes handling.
# As of 3.2 the implicit codec is deliberately strict ASCII: applications
# that deal with improperly quoted URLs are expected to do their own
# decoding and encoding.  If valid use cases are presented, this may be
# relaxed by decoding with latin-1 internally.
_implicit_encoding, _implicit_errors = 'ascii', 'strict'
102
def _noop(obj):
    """Return *obj* unchanged; the result coercion used for str inputs."""
    return obj
105
def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Encode *obj* (a str or a str result object) back to bytes.

    This is the result coercion used when the caller passed bytes.
    """
    encoded = obj.encode(encoding, errors)
    return encoded
109
def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Decode every item of *args* to str and return them as a tuple.

    Falsy items (empty bytes, '' or None) become the empty string.
    """
    decoded = []
    for value in args:
        decoded.append(value.decode(encoding, errors) if value else '')
    return tuple(decoded)
113
def _coerce_args(*args):
    """Return *args* as str values followed by a result-coercion callable.

    For str input the arguments are passed through and the callable is a
    no-op; otherwise the arguments are decoded and the callable encodes the
    result again.  Raises TypeError if str and non-str arguments are mixed.
    """
    first_is_str = isinstance(args[0], str)
    # Empty values are exempt from the check so that the "scheme=''"
    # default argument of some functions works with bytes input too.
    mixed = any(other and isinstance(other, str) != first_is_str
                for other in args[1:])
    if mixed:
        raise TypeError("Cannot mix str and non-str arguments")
    if not first_is_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
129
130# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Mixin giving str result tuples an encode() that yields the bytes twin"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Encode every field and wrap them in the bytes counterpart class."""
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return self._encoded_counterpart(*encoded_fields)
137
138
class _ResultMixinBytes(object):
    """Mixin giving bytes result tuples a decode() that yields the str twin"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Decode every field and wrap them in the str counterpart class."""
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return self._decoded_counterpart(*decoded_fields)
145
146
class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element

    Subclasses supply ``_userinfo`` as a (username, password) pair and
    ``_hostinfo`` as a (hostname, port) pair, both derived from the netloc.
    """
    __slots__ = ()

    @property
    def username(self):
        """The user name from the netloc, or None if there is no userinfo."""
        return self._userinfo[0]

    @property
    def password(self):
        """The password from the netloc, or None if none was given."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """The lower-cased host name, or None if the netloc has no host."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        # Scoped IPv6 address may have zone info, which must not be lowercased
        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
        separator = '%' if isinstance(hostname, str) else b'%'
        hostname, percent, zone = hostname.partition(separator)
        return hostname.lower() + percent + zone

    @property
    def port(self):
        """The port as an int, or None if the netloc has no port.

        Raises ValueError if the port is not made up solely of ASCII digits
        or if it lies outside the range 0-65535.
        """
        port = self._hostinfo[1]
        if port is None:
            return None
        # int() alone is too lenient: it accepts a sign, surrounding
        # whitespace, underscores and non-ASCII digits, none of which can
        # appear in a port (RFC 3986: port = *DIGIT).
        if not (port.isascii() and port.isdigit()):
            message = f'Port could not be cast to integer value as {port!r}'
            raise ValueError(message)
        port = int(port, 10)
        if not (0 <= port <= 65535):
            raise ValueError("Port out of range 0-65535")
        return port

    __class_getitem__ = classmethod(types.GenericAlias)
184
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000185
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    # str flavour of the netloc accessors: splits self.netloc on str
    # delimiters.
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from the text before the last '@'."""
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) taken from the text after the last '@'.

        The port is None when absent or empty; a bracketed (IPv6) host is
        returned without its brackets.
        """
        hostport = self.netloc.rpartition('@')[2]
        _, open_bracket, after_bracket = hostport.partition('[')
        if open_bracket:
            host, _, trailer = after_bracket.partition(']')
            port = trailer.partition(':')[2]
        else:
            host, _, port = hostport.partition(':')
        return host, (port or None)
214
215
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    # bytes flavour of the netloc accessors: splits self.netloc on bytes
    # delimiters.
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from the bytes before the last b'@'."""
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(b':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) taken from the bytes after the last b'@'.

        The port is None when absent or empty; a bracketed (IPv6) host is
        returned without its brackets.
        """
        hostport = self.netloc.rpartition(b'@')[2]
        _, open_bracket, after_bracket = hostport.partition(b'[')
        if open_bracket:
            host, _, trailer = after_bracket.partition(b']')
            port = trailer.partition(b':')[2]
        else:
            host, _, port = hostport.partition(b':')
        return host, (port or None)
244
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000245
246from collections import namedtuple
247
# Plain namedtuple bases for the result classes; the accessor mixins are
# combined with them further down.
_DefragResultBase = namedtuple('DefragResult', ['url', 'fragment'])
_SplitResultBase = namedtuple(
    'SplitResult', ['scheme', 'netloc', 'path', 'query', 'fragment'])
_ParseResultBase = namedtuple(
    'ParseResult',
    ['scheme', 'netloc', 'path', 'params', 'query', 'fragment'])

# --- DefragResult documentation ---

_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

# --- SplitResult documentation ---

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

# --- ParseResult documentation ---
# Only ``params`` is specific to ParseResult; the other fields reuse the
# SplitResult field documentation.

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__
313
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000314
# Backwards-compatibility alias for _NetlocResultMixinStr.  ResultBase is
# no longer part of the documented API, but the name is kept because
# deprecating it isn't worth the hassle.
ResultBase = _NetlocResultMixinStr
319
320# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    __slots__ = ()

    def geturl(self):
        """Return the URL with '#fragment' re-attached when a fragment exists."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000328
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    __slots__ = ()

    def geturl(self):
        """Reassemble the five components into a URL with urlunsplit()."""
        return urlunsplit(self)
333
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    __slots__ = ()

    def geturl(self):
        """Reassemble the six components into a URL with urlunparse()."""
        return urlunparse(self)
338
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000339# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    __slots__ = ()

    def geturl(self):
        """Return the URL with b'#fragment' re-attached when a fragment exists."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
347
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    __slots__ = ()

    def geturl(self):
        """Reassemble the five components into a URL with urlunsplit()."""
        return urlunsplit(self)
352
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    __slots__ = ()

    def geturl(self):
        """Reassemble the six components into a URL with urlunparse()."""
        return urlunparse(self)
357
358# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Link each str result class with its bytes twin, in both directions.

    The links are what _ResultMixinStr.encode() and
    _ResultMixinBytes.decode() use to pick the class of their result.
    """
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for text_cls, binary_cls in zip(str_classes, bytes_classes):
        text_cls._encoded_counterpart = binary_cls
        binary_cls._decoded_counterpart = text_cls

# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000371
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Returns a named 6-tuple whose fields match the layout above: a
    ParseResult for str input, a ParseResultBytes for bytes input.

    The netloc sub-components (username, password, hostname and port) are
    also available as attributes of the returned object.

    scheme is the default used for the scheme component when url does not
    contain one.

    When allow_fragments is false the fragment is not split off; it stays
    part of the preceding component (path or query).

    % escapes are not expanded.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Only schemes known to use parameters have ';params' split off.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return _coerce_result(
        ParseResult(scheme, netloc, path, params, query, fragment))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000401
def _splitparams(url):
    """Split ';params' off the last path segment; return (path, params).

    The caller is expected to have checked that *url* contains a ';'.
    """
    last_slash = url.rfind('/')
    if last_slash < 0:
        # No '/' at all: split at the first ';' of the whole string.
        semicolon = url.find(';')
    else:
        # Only a ';' in the final segment introduces parameters.
        semicolon = url.find(';', last_slash)
        if semicolon < 0:
            return url, ''
    return url[:semicolon], url[semicolon + 1:]
410
def _splitnetloc(url, start=0):
    """Split *url* into (netloc, rest), with the netloc starting at *start*.

    The netloc ends at the first '/', '?' or '#' found at or after *start*,
    or at the end of the string when none of them occurs.
    """
    hits = [pos for pos in (url.find(ch, start) for ch in '/?#') if pos >= 0]
    end = min(hits, default=len(url))
    return url[start:end], url[end:]
418
def _checknetloc(netloc):
    """Reject a netloc that gains URL delimiters under NFKC normalization.

    Characters such as \\u2100 expand to 'a/c' when normalized, and IDNA
    uses NFKC equivalence, so such a netloc could be read as a different
    host.  Returns None if the netloc is acceptable and raises ValueError
    otherwise.
    """
    if not netloc or netloc.isascii():
        return
    import unicodedata
    # Delimiters already present in the netloc are ignored; only the
    # text surrounding them is checked.
    stripped = netloc.translate({ord(ch): None for ch in '@:#?'})
    normalized = unicodedata.normalize('NFKC', stripped)
    if normalized == stripped:
        return
    if any(ch in normalized for ch in '/?#@:'):
        raise ValueError("netloc '" + netloc + "' contains invalid " +
                         "characters under NFKC normalization")
436
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    The result is a named 5-tuple with fields corresponding to the
    above. It is either a SplitResult or SplitResultBytes object,
    depending on the type of the url parameter.

    The username, password, hostname, and port sub-components of netloc
    can also be accessed as attributes of the returned object.

    The scheme argument provides the default value of the scheme
    component when no scheme is found in url.

    If allow_fragments is False, no attempt is made to separate the
    fragment component from the previous component, which can be either
    path or query.

    ASCII tab, carriage return and line feed characters are removed from
    url and scheme before parsing.

    Raises ValueError if the netloc has unbalanced '[' / ']' brackets or
    contains characters that turn into URL delimiters under NFKC
    normalization.

    Note that % escapes are not expanded.
    """

    url, scheme, _coerce_result = _coerce_args(url, scheme)

    # Strip the unsafe characters before anything else looks at the URL.
    # If they were removed only after scheme detection, a URL such as
    # 'ht\ntp://host/' would fail the scheme-character check and be parsed
    # as a bare path, and the cache would be keyed on the raw input.
    for b in _UNSAFE_URL_BYTES_TO_REMOVE:
        url = url.replace(b, "")
        scheme = scheme.replace(b, "")

    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key)
    if cached is not None:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    i = url.find(':')
    # A leading run of valid scheme characters followed by ':' is the scheme.
    if i > 0 and all(c in scheme_chars for c in url[:i]):
        scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000491
def urlunparse(components):
    """Rebuild a URL from the six components of a parsed URL.

    components may be any six-item iterable.  The result can differ
    slightly from the URL that was parsed, while being equivalent, if
    that URL had redundant delimiters, e.g. a ? with an empty query (the
    draft states that these are equivalent).
    """
    *parts, _coerce_result = _coerce_args(*components)
    scheme, netloc, path, params, query, fragment = parts
    if params:
        path = "%s;%s" % (path, params)
    rebuilt = urlunsplit((scheme, netloc, path, query, fragment))
    return _coerce_result(rebuilt)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000502
def urlunsplit(components):
    """Combine the five elements returned by urlsplit() into a complete URL.

    components may be any five-item iterable.  The result can differ
    slightly from the URL that was parsed, while being equivalent, if
    that URL had unnecessary delimiters (for example, a ? with an empty
    query; the RFC states that these are equivalent).
    """
    *parts, _coerce_result = _coerce_args(*components)
    scheme, netloc, url, query, fragment = parts
    # A '//' authority part is emitted when there is a netloc, or when the
    # scheme uses one and the path does not already start with '//'.
    needs_authority = netloc or (
        scheme and scheme in uses_netloc and not url.startswith('//'))
    if needs_authority:
        if url and not url.startswith('/'):
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000521
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.  url
    is returned as given when its scheme differs from the base scheme or
    does not support relative references.
    """
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, _ = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own authority, so it is already
            # absolute.
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not (path or params):
        # No path in the reference: reuse the base path and params, and
        # the base query too unless the reference supplies one.
        query = query or bquery
        return _coerce_result(urlunparse((scheme, netloc, bpath,
                                          bparams, query, fragment)))

    base_segments = bpath.split('/')
    if base_segments[-1]:
        # The last base segment is not a directory, so it takes no part in
        # resolving the relative path.
        base_segments.pop()

    # For RFC 3986, the base path is ignored entirely when the reference
    # path starts at the root.
    if path.startswith('/'):
        segments = path.split('/')
    else:
        segments = base_segments + path.split('/')
        # Drop empty inner segments; they would produce redundant slashes
        # when the resolved path is joined again.
        segments[1:-1] = [part for part in segments[1:-1] if part]

    resolved = []
    for segment in segments:
        if segment == '.':
            continue
        if segment == '..':
            # A '..' with nothing left to remove is ignored (RFC 3986).
            if resolved:
                resolved.pop()
            continue
        resolved.append(segment)

    if segments[-1] in ('.', '..'):
        # The reference ended on a relative directory, so the result needs
        # a trailing '/'.
        resolved.append('')

    joined_path = '/'.join(resolved) or '/'
    return _coerce_result(urlunparse((scheme, netloc, joined_path,
                                      params, query, fragment)))
Antoine Pitrou55ac5b32014-08-21 19:16:17 -0400589
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000590
def urldefrag(url):
    """Split a URL into the URL without its fragment and the fragment.

    Returns a DefragResult (DefragResultBytes for bytes input) holding the
    defragmented URL and the fragment.  The fragment is the empty string
    when the URL has none.
    """
    url, _coerce_result = _coerce_args(url)
    defragmented, fragment = url, ''
    if '#' in url:
        parsed = urlparse(url)
        fragment = parsed.fragment
        defragmented = urlunparse(parsed._replace(fragment=''))
    return _coerce_result(DefragResult(defragmented, fragment))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000606
_hexdig = '0123456789ABCDEFabcdef'
# Table mapping two hex-digit bytes (e.g. b'2f') to the single byte they
# encode.  It is built on the first call to unquote_to_bytes().
_hextobyte = None

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    global _hextobyte
    # Note: str input is encoded as UTF-8.  This only matters if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    pieces = string.split(b'%')
    if len(pieces) == 1:
        return string
    # Delay building the table so that no memory is wasted when this
    # function is never called.
    if _hextobyte is None:
        _hextobyte = {(hi + lo).encode(): bytes.fromhex(hi + lo)
                      for hi in _hexdig for lo in _hexdig}
    out = [pieces[0]]
    for chunk in pieces[1:]:
        decoded = _hextobyte.get(chunk[:2])
        if decoded is None:
            # Not a valid %xx escape: keep the '%' and the text as they are.
            out.append(b'%')
            out.append(chunk)
        else:
            out.append(decoded)
            out.append(chunk[2:])
    return b''.join(out)
639
# Matches runs of ASCII characters; only these can hold %xx escapes.
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    string may be str or bytes; the result is always str.  The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.  Passing None for either one selects its default.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    # Normalise None first so that the bytes branch accepts it as well,
    # instead of failing inside bytes.decode().
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    if isinstance(string, bytes):
        return unquote_to_bytes(string).decode(encoding, errors)
    if '%' not in string:
        # Is it a string-like object?
        string.split
        return string
    # With a capturing group, split() alternates non-ASCII text (even
    # indexes) with ASCII runs (odd indexes).  Only the ASCII runs are
    # unquoted.
    bits = _asciire.split(string)
    res = [bits[0]]
    append = res.append
    for i in range(1, len(bits), 2):
        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
        append(bits[i + 1])
    return ''.join(res)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000668
Senthil Kumaran257b9802017-04-04 21:19:43 -0700669
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
    """Parse a query string into a dictionary of value lists.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value keeps them as blank strings.  The default false
            value drops them, as if they had not been included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, a ValueError is raised when
            parse_qsl() reads more than this many fields.

        separator: str. The symbol to use for separating the query arguments.
            Defaults to &.

        Returns a dictionary mapping each name to the list of its values,
        in order of appearance.
    """
    grouped = {}
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                      encoding=encoding, errors=errors,
                      max_num_fields=max_num_fields, separator=separator)
    for name, value in pairs:
        grouped.setdefault(name, []).append(value)
    return grouped
Facundo Batistac469d4c2008-09-03 22:49:01 +0000710
Senthil Kumaran257b9802017-04-04 21:19:43 -0700711
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

        qs: percent-encoded query string to be parsed.

        keep_blank_values: when true, fields whose value is blank (or
            that have no '=' at all) are kept with an empty string as
            their value.  When false (the default) they are dropped.

        strict_parsing: when true, an empty field or a field without
            '=' raises ValueError.  When false (the default) such
            fields are skipped, or kept as blank if keep_blank_values
            is true.

        encoding and errors: how percent-encoded sequences are decoded
            into Unicode characters, as accepted by bytes.decode().

        max_num_fields: int.  If set, a ValueError is raised when the
            query contains more fields than this limit.

        separator: str.  The symbol separating the query arguments.
            Defaults to &.

    Returns a list of (name, value) tuples in query order.  In both
    names and values '+' is turned into a space before unquoting.
    """
    qs, _coerce_result = _coerce_args(qs)
    separator, _ = _coerce_args(separator)

    if not (separator and isinstance(separator, (str, bytes))):
        raise ValueError("Separator must be of type string or bytes.")

    # Count the fields before splitting so that an oversized body is
    # rejected up front.  This prevents a memory exhaustion DOS attack via
    # post bodies with many fields.
    if max_num_fields is not None:
        num_fields = 1 + qs.count(separator)
        if max_num_fields < num_fields:
            raise ValueError('Max number of fields exceeded')

    pairs = []
    for field in qs.split(separator):
        # Empty fields are only reported as errors in strict mode.
        if not (field or strict_parsing):
            continue
        raw_name, equals, raw_value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control-name with no equal sign counts as a blank value.
            if not keep_blank_values:
                continue
        if not (raw_value or keep_blank_values):
            continue
        name = unquote(raw_name.replace('+', ' '),
                       encoding=encoding, errors=errors)
        name = _coerce_result(name)
        value = unquote(raw_value.replace('+', ' '),
                        encoding=encoding, errors=errors)
        value = _coerce_result(value)
        pairs.append((name, value))
    return pairs
777
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Unquote like unquote(), after turning every '+' into a space.

    This is the decoding required for HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    with_spaces = string.replace('+', ' ')
    return unquote(with_spaces, encoding, errors)
786
# Byte values that are never percent-encoded: ASCII letters, digits and
# the four punctuation characters "_", ".", "-" and "~".
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_.-~'
)
# The same byte values as a bytes object.
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache of quoting callables, keyed by the normalized 'safe' bytes.
_safe_quoters = {}
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000793
class Quoter(collections.defaultdict):
    """Lazily filled table from byte values (in range(0,256)) to strings.

    A byte maps to its own character when it is in the "safe" set (the
    always-safe set plus the extra bytes given to the constructor), and
    to its percent-encoded form ('%XX', upper-case hex) otherwise.
    """
    # The mapping doubles as a cache: __missing__ runs once per byte
    # value, after which lookups of that key are plain dict hits that
    # call no Python code.

    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Show the class name; the inherited repr would say defaultdict.
        return "<%s %r>" % (self.__class__.__name__, dict(self))

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
815
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Percent-encode string so it can be used inside a URL component.

    Each part of a URL (path, query, ...) has its own set of reserved
    characters that must be quoted.  This function is a cautious, not
    minimal, way to quote a string for most of those parts.

    RFC 3986 Uniform Resource Identifier (URI): Generic Syntax lists
    the following (un)reserved characters.

    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="

    Each reserved character is reserved in some component of a URL, but
    not necessarily in all of them.

    Every character that is neither unreserved ("always safe") nor
    listed in the safe argument is %-escaped.  safe defaults to '/':
    the slash is reserved, but quote() is typically applied to a path
    whose existing slashes are to be preserved.

    Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL
    strings.  Now, "~" is included in the set of unreserved characters.

    string and safe may each be either str or bytes.  A str is first
    encoded with encoding (default 'utf-8') and errors (default
    'strict', so unsupported characters raise UnicodeEncodeError), as
    accepted by str.encode().  An empty str is returned unchanged.

    Raises TypeError if encoding or errors is given while string is not
    a str.
    """
    if not isinstance(string, str):
        # Already bytes-like: there is nothing to encode, so the text
        # encoding options make no sense here.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if not string:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'strict'
    return quote_from_bytes(string.encode(encoding, errors), safe)
869
def quote_plus(string, safe='', encoding=None, errors=None):
    """Quote like quote(), but turn ' ' into '+' (HTML form encoding).

    Plus signs in the original string are escaped unless they are
    included in safe.  Unlike quote(), safe does not default to '/'.
    """
    # string may be a str or a bytes object; look for a space with the
    # matching literal.  Any other type takes the general path below.
    if isinstance(string, str):
        has_space = ' ' in string
    elif isinstance(string, bytes):
        has_space = b' ' in string
    else:
        has_space = True

    if not has_space:
        # Without spaces the regular quote already gives the answer.
        return quote(string, safe, encoding, errors)

    # Keep spaces unescaped during quoting, then swap them for '+'.
    blank = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + blank, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000886
def quote_from_bytes(bs, safe='/'):
    """Quote like quote(), but take bytes and skip the str-to-bytes step.

    bs must be a bytes or bytearray object, otherwise TypeError is
    raised.  safe may be str or bytes; non-ASCII entries in it are
    ignored.  The result is always an ASCII str.

    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''

    # Normalize 'safe' to bytes holding ASCII values only.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)

    # Fast path: every byte is safe, so nothing needs escaping.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()

    # One quoting callable per distinct 'safe' value, built on demand.
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000908
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    Arguments:

        query: a mapping (anything with an items() method) or a sequence
            of two-element tuples.  When a sequence is given, the
            parameters appear in the output in input order.

        doseq: when true, a value that is a sequence (but not a str or
            bytes object) is expanded into one 'key=element' parameter
            per element.  When false (the default) every value is
            converted with str() and encoded as a single parameter.

        safe, encoding, errors: passed down to quote_via.  encoding and
            errors are only passed for components that are not bytes.

        quote_via: the quoting function applied to every key and value;
            defaults to quote_plus.

    The components of a query arg may each be either a string or a
    bytes type; anything else is converted with str() first.

    Returns the parameters joined with '&'.

    Raises TypeError if query is neither a mapping nor a sequence whose
    first element is a tuple (a non-empty string, for example).
    """

    def quote_part(part):
        # bytes are quoted as they are; everything else is stringified
        # and quoted with the requested text encoding.
        if isinstance(part, bytes):
            return quote_via(part, safe)
        return quote_via(str(part), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # Non-sequence items do not work with len(), and a non-empty
            # string fails the tuple check.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types get here and succeed.
            # The original implementation allowed empty dicts, so that
            # behavior is preserved for consistency.
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    params = []
    for k, v in query:
        k = quote_part(k)
        if not doseq or isinstance(v, bytes):
            values = [quote_part(v)]
        elif isinstance(v, str):
            # Passed through as-is (no str() call) to stay identical to
            # the historical behavior for str subclasses.
            values = [quote_via(v, safe, encoding, errors)]
        else:
            try:
                # Having a length is the test used for sequence-ness.
                len(v)
            except TypeError:
                # Not a sequence: encode it as a single value.
                values = [quote_part(v)]
            else:
                values = [quote_part(elt) for elt in v]
        params.extend(k + '=' + value for value in values)
    return '&'.join(params)
988
Cheryl Sabella0250de42018-04-25 16:51:54 -0700989
def to_bytes(url):
    """Deprecated public alias of _to_bytes().

    Emits a DeprecationWarning attributed to the caller, then returns
    whatever _to_bytes(url) returns.
    """
    message = "urllib.parse.to_bytes() is deprecated as of 3.8"
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _to_bytes(url)
994
995
def _to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Check that a str URL is pure ASCII and return it as a str.  Despite
    the name no bytes object is produced.  Arguments that are not str
    are returned unchanged.

    Raises UnicodeError if the str contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        ascii_form = url.encode("ASCII")
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
    return ascii_form.decode()
1008
Cheryl Sabella0250de42018-04-25 16:51:54 -07001009
def unwrap(url):
    """Transform a string like '<URL:scheme://host/path>' into 'scheme://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace.  Enclosing angle brackets and a leading 'URL:' prefix
    are then removed, each followed by another strip.  A string that is
    not wrapped is returned unchanged apart from the initial strip.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[len('URL:'):].strip()
    return text
1021
Cheryl Sabella0250de42018-04-25 16:51:54 -07001022
def splittype(url):
    """Deprecated public alias of _splittype().

    Emits a DeprecationWarning attributed to the caller, then returns
    whatever _splittype(url) returns.
    """
    message = ("urllib.parse.splittype() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splittype(url)
1028
1029
# Compiled pattern used by _splittype(); built on first use.
_typeprog = None
def _splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is the run of characters other than '/' and ':' before the
    first ':', and is returned lower-cased.  Returns (None, url) when
    url does not start with such a 'type:' prefix.
    """
    global _typeprog
    if _typeprog is None:
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = _typeprog.match(url)
    if found is None:
        return None, url
    return found.group(1).lower(), found.group(2)
1042
Cheryl Sabella0250de42018-04-25 16:51:54 -07001043
def splithost(url):
    """Deprecated public alias of _splithost().

    Emits a DeprecationWarning attributed to the caller, then returns
    whatever _splithost(url) returns.
    """
    message = ("urllib.parse.splithost() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splithost(url)
1049
1050
# Compiled pattern used by _splithost(); built on first use.
_hostprog = None
def _splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part runs from the leading '//' up to the first '/', '#'
    or '?'.  A non-empty remainder that does not start with '/' gets a
    '/' prepended.  Returns (None, url) when url does not start with
    '//'.
    """
    global _hostprog
    if _hostprog is None:
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    found = _hostprog.match(url)
    if found is None:
        return None, url
    host_port, rest = found.groups()
    if rest and not rest.startswith('/'):
        rest = '/' + rest
    return host_port, rest
1065
Cheryl Sabella0250de42018-04-25 16:51:54 -07001066
def splituser(host):
    """Deprecated public alias of _splituser().

    Emits a DeprecationWarning attributed to the caller, then returns
    whatever _splituser(host) returns.
    """
    message = ("urllib.parse.splituser() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splituser(host)
1072
1073
def _splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  The user part is None when host
    contains no '@'.
    """
    userinfo, at_sign, hostport = host.rpartition('@')
    if not at_sign:
        return None, hostport
    return userinfo, hostport
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001078
Cheryl Sabella0250de42018-04-25 16:51:54 -07001079
def splitpasswd(user):
    """Deprecated public alias of _splitpasswd().

    Emits a DeprecationWarning attributed to the caller, then returns
    whatever _splitpasswd(user) returns.
    """
    message = ("urllib.parse.splitpasswd() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitpasswd(user)
1085
1086
def _splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  The password is None when user
    contains no ':'.
    """
    name, colon, passwd = user.partition(':')
    if not colon:
        return name, None
    return name, passwd
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001091
Cheryl Sabella0250de42018-04-25 16:51:54 -07001092
def splitport(host):
    """Deprecated public alias of _splitport().

    Emits a DeprecationWarning attributed to the caller, then returns
    whatever _splitport(host) returns.
    """
    message = ("urllib.parse.splitport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitport(host)
1098
1099
# Compiled pattern used by _splitport(); built on first use.
_portprog = None
def _splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is the run of ASCII digits after the last ':' and is
    returned as a string.  When that run is empty ('host:') the
    trailing colon is dropped and the port is None.  When host has no
    such suffix at all it is returned unchanged with a port of None.
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

    found = _portprog.fullmatch(host)
    if found is None:
        return host, None
    name, digits = found.groups()
    return name, (digits if digits else None)
1114
Cheryl Sabella0250de42018-04-25 16:51:54 -07001115
def splitnport(host, defport=-1):
    """Deprecated public alias of _splitnport().

    Emits a DeprecationWarning attributed to the caller, then returns
    whatever _splitnport(host, defport) returns.
    """
    message = ("urllib.parse.splitnport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitnport(host, defport)
1121
1122
def _splitnport(host, defport=-1):
    """Split host and port, returning a numeric port.

    The split happens at the last ':'.
    Return the given default port (-1 unless overridden) if there is no
    ':' or nothing follows it.
    Return the port as an int if a valid number is found after ':'.
    Return None as the port if the text after ':' is not a valid number.
    """
    name, colon, port_text = host.rpartition(':')
    if not colon:
        # Without a separator rpartition leaves the whole input last.
        return port_text, defport
    if not port_text:
        return name, defport
    try:
        return name, int(port_text)
    except ValueError:
        return name, None
1138
Cheryl Sabella0250de42018-04-25 16:51:54 -07001139
def splitquery(url):
    """Deprecated public alias of _splitquery().

    Emits a DeprecationWarning attributed to the caller, then returns
    whatever _splitquery(url) returns.
    """
    message = ("urllib.parse.splitquery() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitquery(url)
1145
1146
def _splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  The query is None when url
    contains no '?'.
    """
    head, question_mark, query = url.rpartition('?')
    if not question_mark:
        return url, None
    return head, query
1153
Cheryl Sabella0250de42018-04-25 16:51:54 -07001154
def splittag(url):
    """Deprecated public alias of _splittag().

    Emits a DeprecationWarning attributed to the caller, then returns
    whatever _splittag(url) returns.
    """
    message = ("urllib.parse.splittag() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splittag(url)
1160
1161
def _splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  The tag is None when url
    contains no '#'.
    """
    head, hash_mark, tag = url.rpartition('#')
    if not hash_mark:
        return url, None
    return head, tag
1168
Cheryl Sabella0250de42018-04-25 16:51:54 -07001169
def splitattr(url):
    """Deprecated public alias of _splitattr().

    Emits a DeprecationWarning attributed to the caller, then returns
    whatever _splitattr(url) returns.
    """
    message = ("urllib.parse.splitattr() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitattr(url)
1175
1176
def _splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
1182
Cheryl Sabella0250de42018-04-25 16:51:54 -07001183
def splitvalue(attr):
    """Deprecated public alias of _splitvalue().

    Emits a DeprecationWarning attributed to the caller, then returns
    whatever _splitvalue(attr) returns.
    """
    message = ("urllib.parse.splitvalue() is deprecated as of 3.8, "
               "use urllib.parse.parse_qsl() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitvalue(attr)
1189
1190
def _splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  The value is None when attr
    contains no '='.
    """
    name, equals, value = attr.partition('=')
    if not equals:
        return name, None
    return name, value