blob: 779278bac598a10ed4c19b9d2441d481957910f2 [file] [log] [blame]
"""Parse (absolute and relative) URLs.

urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
"""
29
Serhiy Storchaka8ea46162013-03-14 21:31:37 +020030import re
Facundo Batista2ac5de22008-07-07 18:24:11 +000031import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000032import collections
Cheryl Sabella0250de42018-04-25 16:51:54 -070033import warnings
Facundo Batista2ac5de22008-07-07 18:24:11 +000034
# Public names exported by ``from <module> import *``.
__all__ = [
    # Splitting, joining and reassembling URLs
    "urlparse", "urlunparse", "urljoin", "urldefrag",
    "urlsplit", "urlunsplit",
    # Query strings and percent-quoting
    "urlencode", "parse_qs", "parse_qsl",
    "quote", "quote_plus", "quote_from_bytes",
    "unquote", "unquote_plus", "unquote_to_bytes",
    # Structured result types (str flavour, then bytes flavour)
    "DefragResult", "ParseResult", "SplitResult",
    "DefragResultBytes", "ParseResultBytes", "SplitResultBytes",
]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000041
# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by "urlsplit" and "urlparse".

uses_relative = [
    '', 'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', 'sftp',
    'svn', 'svn+ssh', 'ws', 'wss',
]

uses_netloc = [
    '', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
    'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
    'ws', 'wss',
]

uses_params = [
    '', 'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    'mms', 'sftp', 'tel',
]

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips',
]

uses_query = [
    '', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips',
]

uses_fragment = [
    '', 'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero',
]

# Characters valid in scheme names: ASCII letters, digits, and "+-."
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' '0123456789' '+-.'
)
79
# XXX: Consider replacing with functools.lru_cache
# Upper bound on the number of entries kept in _parse_cache; once the cache
# holds this many entries, urlsplit() empties it before storing another.
MAX_CACHE_SIZE = 20
# Maps (url, scheme, allow_fragments, type(url), type(scheme)) keys to the
# SplitResult previously computed by urlsplit().
_parse_cache = {}
83
def clear_cache():
    """Clear the parse cache and the quoters cache."""
    # Both caches are plain module-level dicts; empty them in place so that
    # any other reference to the same dict objects sees the reset as well.
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000088
89
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
# Default codec and error handler used by _encode_result()/_decode_args()
# when bytes arguments are converted to str and back.
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
98
def _noop(obj):
    """Return *obj* unchanged.

    Used as the result-coercion function when the caller passed str
    arguments, so no re-encoding of the result is needed.
    """
    return obj
101
def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Encode a str-based result back to its bytes form.

    *obj* only needs an ``encode(encoding, errors)`` method, so this works
    for plain strings as well as for the structured result objects.
    """
    encoded = obj.encode(encoding, errors)
    return encoded
105
def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Decode every item of *args* and return the results as a tuple.

    Falsy items (empty bytes, None, ...) are not decoded; they become the
    empty string.
    """
    decoded = []
    for item in args:
        if item:
            decoded.append(item.decode(encoding, errors))
        else:
            decoded.append('')
    return tuple(decoded)
109
def _coerce_args(*args):
    """Normalize *args* to str and pick the matching result converter.

    Returns the (possibly decoded) arguments followed by one extra item: a
    callable to apply to the final result.  That callable is ``_noop`` when
    the first argument is a str and ``_encode_result`` otherwise.

    Raises TypeError if a non-empty later argument disagrees with the first
    argument about being a str.
    """
    expect_str = isinstance(args[0], str)
    for candidate in args[1:]:
        # Empty values are exempt from the check to support the
        # "scheme=''" default argument to some functions.
        if not candidate:
            continue
        if isinstance(candidate, str) != expect_str:
            raise TypeError("Cannot mix str and non-str arguments")
    if not expect_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
125
126# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Return the bytes counterpart of this result.

        Every field is encoded with *encoding*/*errors* and the encoded
        fields are passed positionally to ``self._encoded_counterpart``.
        """
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return self._encoded_counterpart(*encoded_fields)
133
134
class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Return the str counterpart of this result.

        Every field is decoded with *encoding*/*errors* and the decoded
        fields are passed positionally to ``self._decoded_counterpart``.
        """
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return self._decoded_counterpart(*decoded_fields)
141
142
class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element

    Subclasses provide two properties: ``_userinfo``, a
    ``(username, password)`` pair, and ``_hostinfo``, a ``(hostname, port)``
    pair whose items may be str or bytes (or None when absent).
    """
    __slots__ = ()

    @property
    def username(self):
        """The user name part of the netloc, as given by ``_userinfo``."""
        return self._userinfo[0]

    @property
    def password(self):
        """The password part of the netloc, as given by ``_userinfo``."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """The lower-cased host name, or None if the host part is empty."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        # Scoped IPv6 address may have zone info, which must not be lowercased
        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
        separator = '%' if isinstance(hostname, str) else b'%'
        hostname, percent, zone = hostname.partition(separator)
        return hostname.lower() + percent + zone

    @property
    def port(self):
        """The port as an int, or None if no port was given.

        Raises ValueError if the port is not made up solely of ASCII digits
        or lies outside the range 0-65535.
        """
        port = self._hostinfo[1]
        if port is None:
            return None
        # int() alone is too lenient for a URL port: it accepts a sign,
        # surrounding whitespace, "_" digit separators and non-ASCII digits.
        # Only plain ASCII digits are accepted here.
        if not (port.isascii() and port.isdigit()):
            message = f'Port could not be cast to integer value as {port!r}'
            raise ValueError(message)
        port = int(port, 10)
        if not (0 <= port <= 65535):
            raise ValueError("Port out of range 0-65535")
        return port
178
179
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """Netloc accessors for results whose fields are str."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return ``(username, password)`` parsed from ``self.netloc``.

        Both are None when the netloc has no '@'; the password alone is
        None when the user info has no ':'.
        """
        userinfo, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        username, colon, password = userinfo.partition(':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """Return ``(hostname, port)`` parsed from ``self.netloc``.

        The port is the raw text after the ':' (None when empty or absent).
        For a bracketed host the hostname is the text inside the brackets.
        """
        hostinfo = self.netloc.rpartition('@')[2]
        _, open_bracket, bracketed = hostinfo.partition('[')
        if open_bracket:
            # "[host]:port" -- the port can only follow the closing bracket
            hostname, _, trailer = bracketed.partition(']')
            port = trailer.partition(':')[2]
        else:
            hostname, _, port = hostinfo.partition(':')
        return hostname, (port or None)
208
209
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """Netloc accessors for results whose fields are bytes."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return ``(username, password)`` parsed from ``self.netloc``.

        Both are None when the netloc has no b'@'; the password alone is
        None when the user info has no b':'.
        """
        userinfo, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        username, colon, password = userinfo.partition(b':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """Return ``(hostname, port)`` parsed from ``self.netloc``.

        The port is the raw bytes after the b':' (None when empty or
        absent).  For a bracketed host the hostname is the content of the
        brackets.
        """
        hostinfo = self.netloc.rpartition(b'@')[2]
        _, open_bracket, bracketed = hostinfo.partition(b'[')
        if open_bracket:
            # b"[host]:port" -- the port can only follow the closing bracket
            hostname, _, trailer = bracketed.partition(b']')
            port = trailer.partition(b':')[2]
        else:
            hostname, _, port = hostinfo.partition(b':')
        return hostname, (port or None)
238
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000239
from collections import namedtuple

# Plain namedtuple bases for the structured results; the public result
# classes combine these with the str/bytes mixins.
_DefragResultBase = namedtuple('DefragResult', ['url', 'fragment'])
_SplitResultBase = namedtuple(
    'SplitResult', ['scheme', 'netloc', 'path', 'query', 'fragment'])
_ParseResultBase = namedtuple(
    'ParseResult',
    ['scheme', 'netloc', 'path', 'params', 'query', 'fragment'])

# --- DefragResult documentation ---

_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

# --- SplitResult documentation ---

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

# --- ParseResult documentation ---

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

# Every field ParseResult shares with SplitResult reuses its documentation.
for _shared_field in _SplitResultBase._fields:
    getattr(_ParseResultBase, _shared_field).__doc__ = (
        getattr(_SplitResultBase, _shared_field).__doc__)
del _shared_field
307
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000308
# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr
313
314# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """Result of urldefrag() for str input: ``(url, fragment)``."""
    __slots__ = ()

    def geturl(self):
        """Return the URL with the fragment (if non-empty) appended after '#'."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000322
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """Result of urlsplit() for str input (a named 5-tuple)."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL using urlunsplit()."""
        return urlunsplit(self)
327
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """Result of urlparse() for str input (a named 6-tuple)."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL using urlunparse()."""
        return urlunparse(self)
332
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000333# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """Bytes counterpart of DefragResult: ``(url, fragment)``."""
    __slots__ = ()

    def geturl(self):
        """Return the URL with the fragment (if non-empty) appended after b'#'."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
341
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """Bytes counterpart of SplitResult (a named 5-tuple)."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL using urlunsplit()."""
        return urlunsplit(self)
346
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """Bytes counterpart of ParseResult (a named 6-tuple)."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL using urlunparse()."""
        return urlunparse(self)
351
352# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Link each str result class with its bytes twin, in both directions.

    The links are the ``_encoded_counterpart`` / ``_decoded_counterpart``
    class attributes that the result mixins' encode()/decode() rely on.
    """
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000365
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    The result is a named 6-tuple with fields corresponding to the
    above. It is either a ParseResult or ParseResultBytes object,
    depending on the type of the url parameter.

    The username, password, hostname, and port sub-components of netloc
    can also be accessed as attributes of the returned object.

    The scheme argument provides the default value of the scheme
    component when no scheme is found in url.

    If allow_fragments is False, no attempt is made to separate the
    fragment component from the previous component, which can be either
    path or query.

    Note that % escapes are not expanded.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    # Params are split off the path only for schemes listed in uses_params.
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000395
def _splitparams(url):
    """Split *url* at a ';' into ``(path, params)``.

    When *url* contains a '/', only a ';' at or after the last '/' counts,
    and ``(url, '')`` is returned if there is none.  Without any '/', the
    split happens at the first ';'; the caller is expected to have checked
    that a ';' is present in that case.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semi = url.find(';', last_slash)
        if semi < 0:
            return url, ''
    else:
        semi = url.find(';')
    return url[:semi], url[semi + 1:]
404
def _splitnetloc(url, start=0):
    """Split *url* into ``(netloc, rest)``, with the netloc beginning at *start*.

    The netloc ends at the earliest '/', '?' or '#' found at or after
    *start*, or at the end of *url* if none of them occurs.
    """
    found = [pos for pos in (url.find(ch, start) for ch in '/?#') if pos >= 0]
    end = min(found, default=len(url))
    return url[start:end], url[end:]
412
def _checknetloc(netloc):
    """Reject a netloc that gains URL delimiters under NFKC normalization.

    Empty and pure-ASCII netlocs are accepted without further checks.
    Raises ValueError if normalizing the netloc changes it and the
    normalized form contains any of '/', '?', '#', '@' or ':'.
    """
    if not netloc or netloc.isascii():
        return
    # looking for characters like \u2100 that expand to 'a/c'
    # IDNA uses NFKC equivalence, so normalize for this check
    import unicodedata
    # ignore characters already included, but not the surrounding text
    stripped = netloc
    for delimiter in '@:#?':
        stripped = stripped.replace(delimiter, '')
    normalized = unicodedata.normalize('NFKC', stripped)
    if normalized == stripped:
        return
    if any(c in normalized for c in '/?#@:'):
        raise ValueError("netloc '" + netloc + "' contains invalid " +
                         "characters under NFKC normalization")
430
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    The result is a named 5-tuple with fields corresponding to the
    above. It is either a SplitResult or SplitResultBytes object,
    depending on the type of the url parameter.

    The username, password, hostname, and port sub-components of netloc
    can also be accessed as attributes of the returned object.

    The scheme argument provides the default value of the scheme
    component when no scheme is found in url.

    If allow_fragments is False, no attempt is made to separate the
    fragment component from the previous component, which can be either
    path or query.

    Note that % escapes are not expanded.

    Raises ValueError if the netloc has an unbalanced '[' or ']', or if it
    is rejected by the NFKC normalization check.
    """

    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key, so equal-comparing values of
    # different types do not share a cache entry.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    # A leading "<scheme>:" is only recognized when every character before
    # the first ':' is a valid scheme character.
    colon = url.find(':')
    if colon > 0 and all(c in scheme_chars for c in url[:colon]):
        scheme, url = url[:colon].lower(), url[colon + 1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # The brackets of an IPv6 literal must come as a pair.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, _, fragment = url.partition('#')
    if '?' in url:
        url, _, query = url.partition('?')
    _checknetloc(netloc)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return _coerce_result(result)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000482
def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    components is a six-item iterable:
    (scheme, netloc, path, params, query, fragment).
    """
    scheme, netloc, path, params, query, fragment, _coerce_result = (
        _coerce_args(*components))
    # Non-empty params are re-attached to the path behind a ';'.
    if params:
        path = "%s;%s" % (path, params)
    joined = urlunsplit((scheme, netloc, path, query, fragment))
    return _coerce_result(joined)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000493
def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The components argument can be any five-item
    iterable: (scheme, netloc, path, query, fragment).
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment, _coerce_result = (
        _coerce_args(*components))
    # Emit "//" when there is a netloc, or when the scheme is one listed in
    # uses_netloc and the path does not already start with "//".
    needs_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if needs_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000512
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty, the other one is returned unchanged.
    allow_fragments is passed through to urlparse() for both URLs.
    """
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    # A different scheme, or one that does not support relative
    # references, means url is returned as it is.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            # url carries its own network location: nothing to inherit
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not path and not params:
        # Only a query and/or fragment was given: keep the base path.
        path, params = bpath, bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1]:
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        base_parts.pop()

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = filter(None, segments[1:-1])

    resolved_path = []
    for seg in segments:
        if seg == '.':
            continue
        if seg == '..':
            # ignore any .. segments that would otherwise climb above the
            # root of the path when resolving for rfc3986
            if resolved_path:
                resolved_path.pop()
        else:
            resolved_path.append(seg)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    joined_path = '/'.join(resolved_path) or '/'
    return _coerce_result(urlunparse((scheme, netloc, joined_path,
                                      params, query, fragment)))
Antoine Pitrou55ac5b32014-08-21 19:16:17 -0400580
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000581
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        # Nothing to strip: the input is passed through untouched.
        defrag, frag = url, ''
    else:
        scheme, netloc, path, params, query, frag = urlparse(url)
        defrag = urlunparse((scheme, netloc, path, params, query, ''))
    return _coerce_result(DefragResult(defrag, frag))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000597
# Every character that may appear in a %xx escape (both letter cases).
_hexdig = '0123456789ABCDEFabcdef'
# Lookup table from two-hex-digit bytes (e.g. b'2F') to the single decoded
# byte; left as None until unquote_to_bytes() first needs it.
_hextobyte = None
Serhiy Storchaka8ea46162013-03-14 21:31:37 +0200600
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    str input is encoded as UTF-8 first.  A '%' that is not followed by
    two hex digits is kept as it is.
    """
    global _hextobyte
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escaped = string.split(b'%')
    if not escaped:
        # No '%' at all: nothing to decode.
        return string
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    if _hextobyte is None:
        _hextobyte = {(hi + lo).encode(): bytes.fromhex(hi + lo)
                      for hi in _hexdig for lo in _hexdig}
    chunks = [head]
    for piece in escaped:
        decoded = _hextobyte.get(piece[:2])
        if decoded is None:
            # Not a valid escape: restore the '%' that split() consumed.
            chunks += (b'%', piece)
        else:
            chunks += (decoded, piece[2:])
    return b''.join(chunks)
630
# Matches maximal runs of ASCII characters; unquote() splits on it so that
# only the ASCII runs go through percent-decoding.
_asciire = re.compile('([\x00-\x7f]+)')
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000632
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    Passing None for encoding or errors selects the respective default.
    string may be a str or a bytes object; the result is always a str.

    unquote('abc%20def') -> 'abc def'.
    """
    # Resolve the None fallbacks before branching on the input type, so
    # that bytes input honours them as well as str input.
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    if isinstance(string, bytes):
        return unquote_to_bytes(string).decode(encoding, errors)
    if '%' not in string:
        # Is it a string-like object?
        string.split
        return string
    # _asciire.split() alternates non-ASCII text (even indexes) with the
    # captured ASCII runs (odd indexes); only the latter can hold escapes.
    bits = _asciire.split(string)
    res = [bits[0]]
    append = res.append
    for i in range(1, len(bits), 2):
        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
        append(bits[i + 1])
    return ''.join(res)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000659
Senthil Kumaran257b9802017-04-04 21:19:43 -0700660
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, then throws a ValueError if there
            are more than n fields read by parse_qsl().

        Returns a dictionary that maps each field name to the list of its
        values, in the order parse_qsl() produced them.
    """
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                      encoding=encoding, errors=errors,
                      max_num_fields=max_num_fields)
    grouped = {}
    for name, value in pairs:
        # A repeated name accumulates its values in one list.
        grouped.setdefault(name, []).append(value)
    return grouped
Facundo Batistac469d4c2008-09-03 22:49:01 +0000698
Senthil Kumaran257b9802017-04-04 21:19:43 -0700699
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query string into a list of (name, value) pairs.

    Fields are separated by either '&' or ';'; within a field the name
    and value are separated by the first '='.

    Arguments:

    qs: percent-encoded query string to be parsed.

    keep_blank_values: when true, fields with an empty value (or with
        no '=' at all) are kept with an empty-string value.  When false
        (the default) they are dropped.

    strict_parsing: when true, an empty field or a field without '='
        raises ValueError.  When false (the default) such fields are
        silently ignored.

    encoding and errors: how percent-encoded sequences are decoded into
        Unicode characters, as accepted by the bytes.decode() method.

    max_num_fields: int.  If set, ValueError is raised when the query
        holds more than this many fields.

    Returns a list of (name, value) tuples in the order they appear.
    """
    qs, _coerce_result = _coerce_args(qs)

    # Count separators up front so an oversized request is rejected
    # before any per-field work is done.  This guards against memory
    # exhaustion from post bodies with a huge number of fields.
    if max_num_fields is not None:
        field_count = qs.count('&') + qs.count(';') + 1
        if max_num_fields < field_count:
            raise ValueError('Max number of fields exceeded')

    def _decode(text):
        # '+' stands for a space in form data; it must be translated
        # before percent-decoding so that '%2B' still yields a plus.
        text = unquote(text.replace('+', ' '),
                       encoding=encoding, errors=errors)
        return _coerce_result(text)

    result = []
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not (field or strict_parsing):
                continue
            name, sep, value = field.partition('=')
            if not sep:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control name with no equal sign: keep it (with an
                # empty value) only when blanks are wanted.
                if not keep_blank_values:
                    continue
            if value or keep_blank_values:
                result.append((_decode(name), _decode(value)))
    return result
759
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but first turn plus signs into spaces.

    This is the decoding needed for HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # The replacement happens before percent-decoding, so an encoded
    # plus ('%2B') survives as a literal '+'.
    return unquote(string.replace('+', ' '), encoding, errors)
768
# Byte values that are never percent-encoded when quoting: ASCII letters,
# digits and the four punctuation characters '_', '.', '-' and '~'.
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_.-~')
# The same set as a bytes object, for use with bytes methods such as
# rstrip().
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache used by quote_from_bytes(): maps a normalized 'safe' bytes value
# to the bound __getitem__ of the Quoter built for it.
_safe_quoters = {}
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000775
class Quoter(collections.defaultdict):
    """Lazily filled mapping from byte values (ints) to quoted strings.

    A byte that is in the "safe" set (the always-safe set plus whatever
    was passed to the constructor) maps to its own character; every other
    byte maps to its percent-encoded form with upper-case hex digits.
    """
    # The defaultdict base doubles as the cache: once a key has been
    # computed, later lookups never re-enter Python code.
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Show the cached entries; the inherited repr would present this
        # object as a plain defaultdict.
        return "<%s %r>" % (self.__class__.__name__, dict(self))

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
797
def quote(string, safe='/', encoding=None, errors=None):
    """Percent-encode a string for use in a URL component.

    quote('abc def') -> 'abc%20def'

    Each part of a URL (the path, the query, ...) reserves a different
    set of characters, so this function takes a cautious (not minimal)
    approach that works for most of them: every character is %-escaped
    unless it is in the unreserved set ("always safe") or in the extra
    characters given by the safe argument.

    RFC 3986 Uniform Resource Identifier (URI): Generic Syntax lists the
    following (un)reserved characters.

    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="

    Each reserved character is reserved in some component of a URL, but
    not necessarily in all of them.

    safe defaults to '/'.  The slash is a reserved character, but quote()
    is typically called on a path whose existing slashes must be kept.

    Python 3.7 switched from RFC 2396 to RFC 3986 for quoting, which
    added "~" to the set of unreserved characters.

    string and safe may each be a str or a bytes object.  encoding and
    errors must not be given when string is a bytes object (TypeError is
    raised otherwise).

    For str input, encoding and errors say how non-ASCII characters are
    handled, as accepted by the str.encode method.  They default to
    encoding='utf-8' and errors='strict' (unsupported characters raise a
    UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Bytes-like input is quoted as-is; there is nothing to encode.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if not string:
        # Empty str: hand back the very same object.
        return string
    encoded = string.encode('utf-8' if encoding is None else encoding,
                            'strict' if errors is None else errors)
    return quote_from_bytes(encoded, safe)
851
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but encode spaces as '+', as HTML forms require.

    Plus signs already present in the input are percent-escaped unless
    '+' is listed in safe.  Unlike quote(), safe does not default to '/'.
    """
    # Work out whether the input (str or bytes) contains no space at all.
    # In that case plain quote() already gives the right answer.
    if isinstance(string, str):
        no_space = ' ' not in string
    else:
        no_space = isinstance(string, bytes) and b' ' not in string
    if no_space:
        return quote(string, safe, encoding, errors)

    # Let spaces pass through quote() untouched, then swap them for '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000868
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but take a bytes object and skip the str encoding step.

    The result is always an ASCII str.  Raises TypeError when bs is
    neither bytes nor bytearray.

    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''

    # Normalize 'safe' to bytes, silently dropping anything non-ASCII.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)

    # Fast path: nothing is left once all safe bytes are stripped, so no
    # byte needs escaping.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()

    # One Quoter per distinct 'safe' value, shared across calls.
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000890
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.
    Any other object is converted with str() before quoting.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).

    Raises TypeError if query is neither a mapping nor a sequence whose
    first element is a tuple (in particular, a non-empty string).
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # Non-sequence items do not work with len(); non-empty strings
            # fail the tuple check.  Zero-length sequences of any type are
            # accepted, which keeps the historical behaviour for empty
            # inputs.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    def _quote_component(obj):
        # bytes are quoted as-is (encoding/errors do not apply to them);
        # everything else goes through str() first.
        if isinstance(obj, bytes):
            return quote_via(obj, safe)
        return quote_via(str(obj), safe, encoding, errors)

    pairs = []
    for key, value in query:
        key = _quote_component(key)

        if not doseq or isinstance(value, bytes):
            value = _quote_component(value)
            pairs.append(key + '=' + value)
        elif isinstance(value, str):
            # A str is a sequence too, but must be emitted as one value.
            # It is passed through without str() conversion.
            value = quote_via(value, safe, encoding, errors)
            pairs.append(key + '=' + value)
        else:
            try:
                # Anything supporting len() is treated as a sequence.
                len(value)
            except TypeError:
                # Not a sequence: emit it as a single value.
                value = _quote_component(value)
                pairs.append(key + '=' + value)
            else:
                # One 'key=element' parameter per sequence element.
                for elt in value:
                    elt = _quote_component(elt)
                    pairs.append(key + '=' + elt)
    return '&'.join(pairs)
970
Cheryl Sabella0250de42018-04-25 16:51:54 -0700971
def to_bytes(url):
    """Deprecated: emit a DeprecationWarning, then delegate to _to_bytes()."""
    message = "urllib.parse.to_bytes() is deprecated as of 3.8"
    # stacklevel=2 attributes the warning to the caller of to_bytes().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _to_bytes(url)
976
977
def _to_bytes(url):
    """Check that a str URL is pure ASCII and return it.

    Despite the name, a str input comes back as a str.  Anything that is
    not a str is returned untouched.  A str containing non-ASCII
    characters raises UnicodeError.
    """
    # Most URL schemes require ASCII.  If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        # The encode/decode round trip only succeeds for ASCII-only text.
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
990
Cheryl Sabella0250de42018-04-25 16:51:54 -0700991
def unwrap(url):
    """Strip the '<URL:...>' wrapping from a URL.

    '<URL:scheme://host/path>' becomes 'scheme://host/path'.  The angle
    brackets and the 'URL:' prefix are each optional and removed
    independently; surrounding whitespace is stripped at every step.
    Input that is not wrapped is returned as its stripped str() form.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
1003
Cheryl Sabella0250de42018-04-25 16:51:54 -07001004
def splittype(url):
    """Deprecated: emit a DeprecationWarning, then delegate to _splittype()."""
    message = ("urllib.parse.splittype() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of splittype().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splittype(url)
1010
1011
# Compiled lazily by _splittype() on its first call.
_typeprog = None
def _splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type (scheme) is lower-cased.  When url has no leading
    'type:' part, the result is (None, url).
    """
    global _typeprog
    pattern = _typeprog
    if pattern is None:
        pattern = _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = pattern.match(url)
    if found is None:
        return None, url
    scheme, rest = found.groups()
    return scheme.lower(), rest
1024
Cheryl Sabella0250de42018-04-25 16:51:54 -07001025
def splithost(url):
    """Deprecated: emit a DeprecationWarning, then delegate to _splithost()."""
    message = ("urllib.parse.splithost() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of splithost().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splithost(url)
1031
1032
# Compiled lazily by _splithost() on its first call.
_hostprog = None
def _splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '#' or '?'.  A non-empty
    remainder that does not start with '/' gets one prepended.  When url
    does not start with '//', the result is (None, url).
    """
    global _hostprog
    pattern = _hostprog
    if pattern is None:
        pattern = _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    found = pattern.match(url)
    if found is None:
        return None, url
    host_port, rest = found.groups()
    if rest and not rest.startswith('/'):
        rest = '/' + rest
    return host_port, rest
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001046 return None, url
1047
Cheryl Sabella0250de42018-04-25 16:51:54 -07001048
def splituser(host):
    """Deprecated: emit a DeprecationWarning, then delegate to _splituser()."""
    message = ("urllib.parse.splituser() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of splituser().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splituser(host)
1054
1055
def _splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Without any '@' the user part
    is None.
    """
    userinfo, at_sign, hostport = host.rpartition('@')
    if not at_sign:
        userinfo = None
    return userinfo, hostport
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001060
Cheryl Sabella0250de42018-04-25 16:51:54 -07001061
def splitpasswd(user):
    """Deprecated: emit a DeprecationWarning, then delegate to _splitpasswd()."""
    message = ("urllib.parse.splitpasswd() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of splitpasswd().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitpasswd(user)
1067
1068
def _splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without any ':' the password
    part is None.
    """
    name, colon, secret = user.partition(':')
    if not colon:
        secret = None
    return name, secret
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001073
Cheryl Sabella0250de42018-04-25 16:51:54 -07001074
def splitport(host):
    """Deprecated: emit a DeprecationWarning, then delegate to _splitport()."""
    message = ("urllib.parse.splitport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of splitport().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitport(host)
1080
1081
# Compiled lazily by _splitport() on its first call.
_portprog = None
def _splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of ASCII digits.  It is None when
    host has no ':', when nothing follows the last ':' (the trailing
    colon is dropped from the host), or when what follows is not all
    digits (host is then returned unchanged).
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

    found = _portprog.fullmatch(host)
    if found is not None:
        name, port = found.groups()
        if port:
            return name, port
        # 'host:' -- keep the host without its dangling colon.
        host = name
    return host, None
1096
Cheryl Sabella0250de42018-04-25 16:51:54 -07001097
def splitnport(host, defport=-1):
    """Deprecated: emit a DeprecationWarning, then delegate to _splitnport()."""
    message = ("urllib.parse.splitnport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of splitnport().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitnport(host, defport)
1103
1104
def _splitnport(host, defport=-1):
    """Split host and port, returning a numeric port.

    The split happens at the last ':' in host.

    Return the given default port (-1 unless overridden) if there is no
    ':' or if nothing follows it.
    Return the port as an int if a valid number is found after ':'.
    Return None as the port if ':' is followed by anything that is not
    a plain run of ASCII digits.
    """
    host, delim, port = host.rpartition(':')
    if not delim:
        # No colon at all: rpartition() put the whole input in 'port'.
        host = port
    elif port:
        # int() alone is too lenient for a port: it accepts a sign,
        # surrounding whitespace, '_' separators and non-ASCII digits.
        # Only a plain run of ASCII digits counts as a valid number.
        if port.isascii() and port.isdigit():
            nport = int(port)
        else:
            nport = None
        return host, nport
    return host, defport
1120
Cheryl Sabella0250de42018-04-25 16:51:54 -07001121
def splitquery(url):
    """Deprecated: emit a DeprecationWarning, then delegate to _splitquery()."""
    message = ("urllib.parse.splitquery() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of splitquery().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitquery(url)
1127
1128
def _splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without any '?' the result is
    (url, None).
    """
    head, question_mark, query = url.rpartition('?')
    if not question_mark:
        return url, None
    return head, query
1135
Cheryl Sabella0250de42018-04-25 16:51:54 -07001136
def splittag(url):
    """Deprecated: emit a DeprecationWarning, then delegate to _splittag()."""
    message = ("urllib.parse.splittag() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of splittag().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splittag(url)
1142
1143
def _splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without any '#' the result is
    (url, None).
    """
    head, hash_mark, tag = url.rpartition('#')
    if not hash_mark:
        return url, None
    return head, tag
1150
Cheryl Sabella0250de42018-04-25 16:51:54 -07001151
def splitattr(url):
    """Deprecated: emit a DeprecationWarning, then delegate to _splitattr()."""
    message = ("urllib.parse.splitattr() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of splitattr().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitattr(url)
1157
1158
def _splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
1164
Cheryl Sabella0250de42018-04-25 16:51:54 -07001165
def splitvalue(attr):
    """Deprecated: emit a DeprecationWarning, then delegate to _splitvalue()."""
    message = ("urllib.parse.splitvalue() is deprecated as of 3.8, "
               "use urllib.parse.parse_qsl() instead")
    # stacklevel=2 attributes the warning to the caller of splitvalue().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitvalue(attr)
1171
1172
def _splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without any '=' the value
    part is None.
    """
    name, equals, value = attr.partition('=')
    if not equals:
        value = None
    return name, value