blob: ea897c3032257b290724c21c2e4c20542175b42c [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaranfd41e082010-04-17 14:44:14 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732 : "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000011RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000012Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
David Malcolmee255682010-12-02 16:41:00 +000014RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000015
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000019RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000020McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000028"""
29
Serhiy Storchaka8ea46162013-03-14 21:31:37 +020030import re
Facundo Batista2ac5de22008-07-07 18:24:11 +000031import sys
Batuhan Taşkaya03615562020-04-10 17:46:36 +030032import types
Guido van Rossum52dbbb92008-08-18 21:44:30 +000033import collections
Cheryl Sabella0250de42018-04-25 16:51:54 -070034import warnings
Facundo Batista2ac5de22008-07-07 18:24:11 +000035
# Public names exported by a star-import of this module.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000042
# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by “urlsplit” and “urlparse”.

# Schemes for which urljoin() resolves a relative reference against the base.
uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']

# Schemes that take a "//netloc" part; consulted by urljoin() and urlunsplit().
uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']

# Schemes for which urlparse() splits ";params" off the path.
uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips']

uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero']

# Characters valid in scheme names; urlsplit() only accepts a "scheme:"
# prefix made up entirely of these characters.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
80
# XXX: Consider replacing with functools.lru_cache
# urlsplit() stores its results in _parse_cache; once the cache holds
# MAX_CACHE_SIZE entries, urlsplit() empties it via clear_cache() before
# adding the next one.
MAX_CACHE_SIZE = 20
_parse_cache = {}
84
def clear_cache():
    """Empty the URL parse cache and the cache of quoter objects."""
    # Same order as before: parse cache first, then the quoters cache.
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000089
90
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
# These two values are the default codec and error handler of
# _encode_result() and _decode_args() below.
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
99
def _noop(obj):
    """Return *obj* unchanged.

    _coerce_args() hands this back as the result-coercion function when
    the inputs were already str, so no re-encoding is needed.
    """
    return obj
102
def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Encode a str-based result back to bytes.

    *obj* is anything with an ``encode(encoding, errors)`` method: a plain
    str or one of the str result tuples defined in this module.
    """
    encoded = obj.encode(encoding, errors)
    return encoded
106
def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Decode every item of *args* to str and return them as a tuple.

    Falsy items (empty bytes, '', None) are replaced by '' without being
    decoded.
    """
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)
110
def _coerce_args(*args):
    """Normalise the arguments to str and pick the matching result coercer.

    Returns the (possibly decoded) arguments followed by one extra item:
    _noop when the first argument is a str, _encode_result otherwise.

    Raises TypeError if a non-empty later argument disagrees with the
    first argument about being a str.
    """
    first_is_str = isinstance(args[0], str)
    # Empty values are exempt from the check so that the "scheme=''"
    # default argument of some functions works with bytes input.
    mixed = any(other and isinstance(other, str) != first_is_str
                for other in args[1:])
    if mixed:
        raise TypeError("Cannot mix str and non-str arguments")
    if not first_is_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
126
127# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Encode every field and build the bytes counterpart of this result."""
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return self._encoded_counterpart(*encoded_fields)
134
135
class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Decode every field and build the str counterpart of this result."""
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return self._decoded_counterpart(*decoded_fields)
142
143
class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element"""
    __slots__ = ()

    # Subclasses supply ``_userinfo`` -> (username, password) and
    # ``_hostinfo`` -> (hostname, port), in str or bytes form.

    @property
    def username(self):
        """User name from the netloc, as given by ``_userinfo``."""
        return self._userinfo[0]

    @property
    def password(self):
        """Password from the netloc, as given by ``_userinfo``."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when the host part is empty."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        # Scoped IPv6 address may have zone info, which must not be lowercased
        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
        separator = '%' if isinstance(hostname, str) else b'%'
        hostname, percent, zone = hostname.partition(separator)
        return hostname.lower() + percent + zone

    @property
    def port(self):
        """Port number as an int, or None when the netloc has no port.

        Raises ValueError if the port is not made up solely of ASCII
        digits, or if its value lies outside 0-65535.
        """
        port = self._hostinfo[1]
        if port is not None:
            # int() alone is too lenient for a URL port: it also accepts a
            # sign, surrounding whitespace, underscores and non-ASCII
            # digits.  Only plain ASCII digit strings are valid here.
            if port.isdigit() and port.isascii():
                port = int(port)
            else:
                message = f'Port could not be cast to integer value as {port!r}'
                raise ValueError(message)
            if not (0 <= port <= 65535):
                raise ValueError("Port out of range 0-65535")
        return port

    __class_getitem__ = classmethod(types.GenericAlias)
181
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000182
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    # str flavour of the netloc helpers: splits ``self.netloc`` into
    # user-info and host-info parts using str delimiters.
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) parsed from the netloc.

        Both are None without an '@'; password is None without a ':' in
        the user-info part.
        """
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) parsed from the netloc.

        The port is the raw text after ':' (None when absent or empty); a
        bracketed host has its '[' and ']' removed.
        """
        host_and_port = self.netloc.rpartition('@')[2]
        _, open_bracket, after_bracket = host_and_port.partition('[')
        if open_bracket:
            # Bracketed literal: the port, if any, follows the closing ']'.
            host, _, tail = after_bracket.partition(']')
            port = tail.partition(':')[2]
        else:
            host, _, port = host_and_port.partition(':')
        return host, (port or None)
211
212
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    # bytes flavour of the netloc helpers: splits ``self.netloc`` into
    # user-info and host-info parts using bytes delimiters.
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) parsed from the netloc.

        Both are None without a b'@'; password is None without a b':' in
        the user-info part.
        """
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(b':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) parsed from the netloc.

        The port is the raw bytes after b':' (None when absent or empty); a
        bracketed host has its b'[' and b']' removed.
        """
        host_and_port = self.netloc.rpartition(b'@')[2]
        _, open_bracket, after_bracket = host_and_port.partition(b'[')
        if open_bracket:
            # Bracketed literal: the port, if any, follows the closing b']'.
            host, _, tail = after_bracket.partition(b']')
            port = tail.partition(b':')[2]
        else:
            host, _, port = host_and_port.partition(b':')
        return host, (port or None)
241
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000242
243from collections import namedtuple
244
# Plain namedtuple bases for the public result classes defined below.
# The str and bytes result classes share these bases, so the docstrings
# assigned here apply to both flavours.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

# Fields that ParseResult shares with SplitResult reuse its field docstrings;
# only ``params`` needs text of its own.
_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__
310
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000311
# For backwards compatibility, ResultBase is kept as an alias of
# _NetlocResultMixinStr.  ResultBase is no longer part of the documented
# API, but it is retained since deprecating it isn't worth the hassle.
ResultBase = _NetlocResultMixinStr
316
317# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    # str result of urldefrag(): a (url, fragment) pair.
    __slots__ = ()

    def geturl(self):
        """Return the URL with '#fragment' re-attached when a fragment exists."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000325
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    # str result of urlsplit(): five components plus the netloc accessors.
    __slots__ = ()

    def geturl(self):
        """Reassemble the five components into a URL via urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt
330
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    # str result of urlparse(): six components plus the netloc accessors.
    __slots__ = ()

    def geturl(self):
        """Reassemble the six components into a URL via urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt
335
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000336# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    # bytes twin of DefragResult: a (url, fragment) pair of bytes.
    __slots__ = ()

    def geturl(self):
        """Return the URL with b'#fragment' re-attached when a fragment exists."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
344
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    # bytes twin of SplitResult.
    __slots__ = ()

    def geturl(self):
        """Reassemble the five components into a URL via urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt
349
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    # bytes twin of ParseResult.
    __slots__ = ()

    def geturl(self):
        """Reassemble the six components into a URL via urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt
354
355# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Link each str result class with its bytes twin, in both directions.

    After this runs, encode() on a str result knows which bytes class to
    build (``_encoded_counterpart``) and decode() on a bytes result knows
    which str class to build (``_decoded_counterpart``).
    """
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# One-shot setup: run it once, then drop the helper from the namespace.
_fix_result_transcoding()
del _fix_result_transcoding
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000368
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Returns a named 6-tuple whose fields match the components above: a
    ParseResult for str input, a ParseResultBytes for bytes input.  The
    username, password, hostname and port sub-components of netloc are
    available as attributes of the result.

    *scheme* is the default scheme used when *url* does not contain one.

    When *allow_fragments* is false the fragment is not split off; it
    stays part of the preceding component (path or query).

    Note that % escapes are not expanded.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    # Only schemes listed in uses_params get ";params" split off the path.
    params = ''
    if ';' in path and scheme in uses_params:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000398
def _splitparams(url):
    """Split *url* (a path) into (path, params) at a ';'.

    When the path contains a '/', only a ';' at or after the last '/' is
    considered, and (url, '') is returned if there is none.  Without any
    '/', the split happens at the first ';'.
    """
    last_slash = url.rfind('/')
    if last_slash < 0:
        # The caller only gets here when ';' is present, so find() succeeds.
        semi = url.find(';')
    else:
        semi = url.find(';', last_slash)
        if semi < 0:
            return url, ''
    return url[:semi], url[semi + 1:]
407
def _splitnetloc(url, start=0):
    """Return (netloc, rest) for *url*, with the netloc starting at *start*.

    The netloc runs up to the earliest '/', '?' or '#' found at or after
    *start*, or to the end of *url* when none of them occurs.
    """
    positions = (url.find(delimiter, start) for delimiter in '/?#')
    found = [pos for pos in positions if pos >= 0]
    end = min(found, default=len(url))
    return url[start:end], url[end:]
415
def _checknetloc(netloc):
    """Reject a netloc whose NFKC-normalized form gains a URL delimiter.

    Characters like \\u2100 expand to 'a/c' under NFKC, and IDNA uses NFKC
    equivalence, so a netloc is checked in its normalized form.  Raises
    ValueError when normalization introduces any of '/', '?', '#', '@'
    or ':'; returns None otherwise.
    """
    if not netloc or netloc.isascii():
        return
    import unicodedata
    # Drop delimiters that are already literally present; only those that
    # appear because of normalization are of interest.
    stripped = netloc
    for delimiter in '@:#?':
        stripped = stripped.replace(delimiter, '')
    normalized = unicodedata.normalize('NFKC', stripped)
    if normalized == stripped:
        return
    if any(delimiter in normalized for delimiter in '/?#@:'):
        raise ValueError("netloc '" + netloc + "' contains invalid " +
                         "characters under NFKC normalization")
433
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Returns a named 5-tuple whose fields match the components above: a
    SplitResult for str input, a SplitResultBytes for bytes input.  The
    username, password, hostname and port sub-components of netloc are
    available as attributes of the result.

    *scheme* is the default scheme used when *url* does not contain one.

    When *allow_fragments* is false the fragment is not split off; it
    stays part of the preceding component (path or query).

    Raises ValueError for a netloc with unbalanced '[' / ']' or one that
    _checknetloc() rejects.

    Note that % escapes are not expanded.
    """

    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    cache_key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(cache_key, None)
    if hit:
        return _coerce_result(hit)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        # Text before the first ':' is a scheme only if every character
        # of it is a valid scheme character.
        prefix = url[:colon]
        if all(ch in scheme_chars for ch in prefix):
            scheme, url = prefix.lower(), url[colon + 1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Exactly one kind of bracket present means a malformed IPv6 literal.
        has_open = '[' in netloc
        has_close = ']' in netloc
        if has_open != has_close:
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return _coerce_result(result)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000485
def urlunparse(components):
    """Reassemble a URL from the six components produced by urlparse().

    The result may differ slightly from the URL originally parsed while
    still being equivalent, e.g. when that URL had redundant delimiters
    such as a ? with an empty query (the draft states that these are
    equivalent).
    """
    *parts, _coerce_result = _coerce_args(*components)
    scheme, netloc, path, params, query, fragment = parts
    if params:
        # Fold the params back into the path so urlunsplit() can do the rest.
        path = "%s;%s" % (path, params)
    five_tuple = (scheme, netloc, path, query, fragment)
    return _coerce_result(urlunsplit(five_tuple))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000496
def urlunsplit(components):
    """Combine the five components returned by urlsplit() into a URL.

    *components* can be any five-item iterable.  The result may differ
    slightly from the URL originally parsed while still being equivalent,
    e.g. when that URL had unnecessary delimiters such as a ? with an
    empty query (the RFC states that these are equivalent).
    """
    scheme, netloc, url, query, fragment, _coerce_result = (
        _coerce_args(*components))
    # An authority part is written when there is a netloc, or when the
    # scheme is one that takes a netloc and the path does not already
    # start with '//'.
    needs_authority = bool(netloc) or (
        bool(scheme) and scheme in uses_netloc and url[:2] != '//')
    if needs_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000515
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.
    *allow_fragments* is passed through to urlparse() for both URLs.
    """
    # An empty operand means there is nothing to combine.
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, _ = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    # A different scheme, or one without relative semantics: url stands alone.
    if not (scheme == bscheme and scheme in uses_relative):
        return _coerce_result(url)

    if scheme in uses_netloc:
        if netloc:
            # url carries its own netloc, so only the scheme is inherited.
            return _coerce_result(urlunparse(
                (scheme, netloc, path, params, query, fragment)))
        netloc = bnetloc

    if not (path or params):
        # No path of its own: reuse the base path and params, and the base
        # query too unless url supplies one.
        return _coerce_result(urlunparse(
            (scheme, netloc, bpath, bparams, query or bquery, fragment)))

    base_segments = bpath.split('/')
    if base_segments[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        base_segments.pop()

    own_segments = path.split('/')
    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = own_segments
    else:
        segments = base_segments + own_segments
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = [part for part in segments[1:-1] if part]

    resolved = []
    for segment in segments:
        if segment == '.':
            continue
        if segment != '..':
            resolved.append(segment)
        elif resolved:
            # A '..' with nothing left to remove is ignored (rfc3986).
            resolved.pop()

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved.append('')

    joined_path = '/'.join(resolved) or '/'
    return _coerce_result(urlunparse(
        (scheme, netloc, joined_path, params, query, fragment)))
Antoine Pitrou55ac5b32014-08-21 19:16:17 -0400583
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000584
def urldefrag(url):
    """Split any fragment off *url*.

    Returns a DefragResult (or its bytes counterpart for bytes input)
    holding the URL without its fragment and the fragment itself.  The
    fragment is the empty string when *url* contains no '#'.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return _coerce_result(DefragResult(stripped, fragment))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000600
_hexdig = '0123456789ABCDEFabcdef'
_hextobyte = None

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    pieces = string.split(b'%')
    if len(pieces) == 1:
        # No '%' at all, so there is nothing to unquote.
        return string
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    global _hextobyte
    if _hextobyte is None:
        _hextobyte = {(hi + lo).encode(): bytes.fromhex(hi + lo)
                      for hi in _hexdig for lo in _hexdig}
    out = [pieces[0]]
    for piece in pieces[1:]:
        decoded = _hextobyte.get(piece[:2])
        if decoded is None:
            # Not a valid %xx escape: keep the '%' and the text as they were.
            out.append(b'%')
            out.append(piece)
        else:
            out.append(decoded)
            out.append(piece[2:])
    return b''.join(out)
633
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.  Passing None for
    encoding or errors selects the same defaults, for both str and bytes
    input.

    unquote('abc%20def') -> 'abc def'.
    """
    # Resolve the None fallbacks before anything else so that they apply to
    # bytes input as well; bytes.decode() does not accept None.
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    if isinstance(string, bytes):
        return unquote_to_bytes(string).decode(encoding, errors)
    if '%' not in string:
        # Is it a string-like object?
        string.split
        return string
    # Splitting on the capturing group alternates non-ASCII text (even
    # indexes) with ASCII runs (odd indexes); only ASCII runs can hold
    # %xx escapes.
    bits = _asciire.split(string)
    res = [bits[0]]
    append = res.append
    for i in range(1, len(bits), 2):
        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
        append(bits[i + 1])
    return ''.join(res)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000662
Senthil Kumaran257b9802017-04-04 21:19:43 -0700663
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, then throws a ValueError if there
            are more than n fields read by parse_qsl().

        Returns a dictionary that maps each field name to the list of its
        values, in the order parse_qsl() produced them.
    """
    parsed_result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors,
                                 max_num_fields=max_num_fields):
        # Repeated names accumulate all their values in one list.
        parsed_result.setdefault(name, []).append(value)
    return parsed_result
Facundo Batistac469d4c2008-09-03 22:49:01 +0000701
Senthil Kumaran257b9802017-04-04 21:19:43 -0700702
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query string into a list of (name, value) pairs.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be kept.  A true value
            retains them as blank strings; the default false value
            drops them as if they had not been included.

        strict_parsing: flag indicating what to do with parsing errors. If
            false (the default), errors are silently ignored. If true,
            errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, a ValueError is raised when the
            query holds more than that many fields.

        Returns a list of (name, value) tuples in query order.  Fields
        are separated by '&' or ';'.
    """
    qs, _coerce_result = _coerce_args(qs)

    # Count the fields before splitting anything, so an oversized query
    # is rejected up front.  This prevents a memory exhaustion DOS
    # attack via post bodies with many fields.
    if max_num_fields is not None:
        field_count = 1 + qs.count('&') + qs.count(';')
        if max_num_fields < field_count:
            raise ValueError('Max number of fields exceeded')

    def decode(text):
        # '+' stands for a space; percent-escapes are resolved afterwards.
        text = unquote(text.replace('+', ' '),
                       encoding=encoding, errors=errors)
        return _coerce_result(text)

    result = []
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not field and not strict_parsing:
                continue
            raw_name, equals, raw_value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control-name with no equal sign is kept only as a
                # blank value (partition() already left raw_value empty).
                if not keep_blank_values:
                    continue
            if not (raw_value or keep_blank_values):
                continue
            result.append((decode(raw_name), decode(raw_value)))
    return result
762
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but first turn every plus sign into a space, as
    required for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # The replacement happens before unquoting, so an escaped plus
    # ('%2B') is still decoded to a literal '+'.
    return unquote(string.replace('+', ' '), encoding, errors)
771
# Byte values that are never percent-encoded: ASCII letters, digits and
# "_.-~" (the RFC 3986 "unreserved" characters).  Iterating a bytes
# literal yields ints, so the frozenset holds integer byte values.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-~')
# The same byte values packed into a bytes object; quote_from_bytes()
# concatenates it with the caller's safe bytes and hands it to bytes.rstrip().
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache filled lazily by quote_from_bytes(): normalized safe bytes ->
# bound Quoter.__getitem__ for that safe set.
_safe_quoters = {}
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000778
class Quoter(collections.defaultdict):
    """Self-filling cache mapping byte values (in range(0,256)) to strings.

    A byte that belongs to the "safe" set (the always-safe set plus the
    bytes given to the constructor) maps to its own character; any other
    byte maps to its percent-encoded form.
    """
    # Built on defaultdict for efficiency: only a cache miss runs Python
    # code (__missing__); lookups of cached keys don't call Python code
    # at all.
    def __init__(self, safe):
        """safe: bytes object listing extra bytes to leave unquoted."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # The inherited repr would present this object as a defaultdict.
        cached = dict(self)
        return "<%s %r>" % (self.__class__.__name__, cached)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
800
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Percent-encode every character of string that is neither "always
    safe" nor listed in the safe argument.

    Different parts of a URL (the path info, the query, etc.) reserve
    different characters, so no single rule is minimal for all of them.
    quote() takes the cautious route and escapes more than is strictly
    required for most of these parts.

    RFC 3986 Uniform Resource Identifier (URI): Generic Syntax defines
    the following (un)reserved characters.

    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="

    The unreserved characters form the "always safe" set.  Each reserved
    character is significant in some URL component, though not
    necessarily in all of them.

    safe defaults to '/'.  The slash is reserved, but quote() is
    typically applied to a path whose existing slashes must survive.

    Since Python 3.7 quoting follows RFC 3986 instead of RFC 2396, so
    "~" is part of the unreserved set.

    string and safe may each be a str or a bytes object.  encoding and
    errors say how a str is turned into bytes (see str.encode); they
    default to 'utf-8' and 'strict', so unsupported characters raise a
    UnicodeEncodeError.  Passing either of them together with a non-str
    string raises TypeError.  An empty str is returned unchanged.
    """
    if not isinstance(string, str):
        # Already bytes: there is nothing to encode, so codec arguments
        # can only be a caller mistake.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    elif not string:
        return string
    else:
        string = string.encode(
            'utf-8' if encoding is None else encoding,
            'strict' if errors is None else errors)
    return quote_from_bytes(string, safe)
854
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also turn ' ' into '+', as required for quoting
    HTML form values.  Plus signs already present in string are escaped
    unless '+' is listed in safe.  Unlike quote(), safe does not default
    to '/'.
    """
    # string may be a str or a bytes object.  When it contains no space
    # the plain quote() result is already the right answer.
    if isinstance(string, str):
        space_free = ' ' not in string
    else:
        space_free = isinstance(string, bytes) and b' ' not in string
    if space_free:
        return quote(string, safe, encoding, errors)

    # Let spaces pass through quote() untouched (the marker must have
    # the same type as safe), then swap them for '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000871
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but takes a bytes (or bytearray) object instead of a
    str and performs no string-to-bytes encoding.  The result is always
    an ASCII string, with escapes written in uppercase hexadecimal.

    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'

    Raises TypeError if bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    # Normalize 'safe' to a bytes object holding ASCII values only.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes([c for c in safe if c < 128])
    # Shortcut: if stripping safe bytes leaves nothing, no byte needs
    # quoting and the input can be decoded as is.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    # One Quoter lookup function is cached per distinct safe set.
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000893
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.
    Any other object is converted with str() before being quoted.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).

    Raises TypeError if query is neither a mapping nor a sequence whose
    first element is a tuple (a non-empty string, for instance).
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    def encode(component):
        # bytes are quoted as they are; anything else is stringified
        # first and quoted with the caller's encoding and errors.
        if isinstance(component, bytes):
            return quote_via(component, safe)
        return quote_via(str(component), safe, encoding, errors)

    pairs = []
    for k, v in query:
        k = encode(k)
        if not doseq:
            pairs.append(k + '=' + encode(v))
        elif isinstance(v, bytes):
            pairs.append(k + '=' + quote_via(v, safe))
        elif isinstance(v, str):
            # Deliberately no str() call here: with doseq a str value is
            # handed to quote_via as is, which matters for str subclasses
            # that override __str__.
            pairs.append(k + '=' + quote_via(v, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                pairs.append(k + '=' + encode(v))
            else:
                # one parameter per sequence element
                for elt in v:
                    pairs.append(k + '=' + encode(elt))
    return '&'.join(pairs)
973
Cheryl Sabella0250de42018-04-25 16:51:54 -0700974
def to_bytes(url):
    """Deprecated public alias of _to_bytes(): warn, then delegate."""
    message = "urllib.parse.to_bytes() is deprecated as of 3.8"
    # stacklevel=2 attributes the warning to the caller of to_bytes().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _to_bytes(url)
979
980
def _to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str is returned only if it is pure ASCII, otherwise UnicodeError
    is raised.  Any object that is not a str is returned untouched.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        # The encode/decode round trip only serves as the ASCII check.
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
993
Cheryl Sabella0250de42018-04-25 16:51:54 -0700994
def unwrap(url):
    """Transform a string like '<URL:scheme://host/path>' into 'scheme://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace.  Enclosing angle brackets and a leading 'URL:' marker are
    each removed when present; a string with neither is returned with
    only the whitespace stripped.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
1006
Cheryl Sabella0250de42018-04-25 16:51:54 -07001007
def splittype(url):
    """Deprecated public alias of _splittype(): warn, then delegate."""
    message = ("urllib.parse.splittype() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of splittype().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splittype(url)
1013
1014
# Compiled lazily by _splittype() on its first call.
_typeprog = None
def _splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  If url does not start with a
    run of characters other than '/' and ':' followed by ':', the result
    is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = _typeprog.match(url)
    if found is None:
        return None, url
    return found.group(1).lower(), found.group(2)
1027
Cheryl Sabella0250de42018-04-25 16:51:54 -07001028
def splithost(url):
    """Deprecated public alias of _splithost(): warn, then delegate."""
    message = ("urllib.parse.splithost() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of splithost().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splithost(url)
1034
1035
# Compiled lazily by _splithost() on its first call.
_hostprog = None
def _splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '#' or '?'.  A non-empty
    remainder that does not begin with '/' gets one prepended.  If url
    does not start with '//', the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    found = _hostprog.match(url)
    if found is None:
        return None, url
    authority, rest = found.group(1), found.group(2)
    if rest and not rest.startswith('/'):
        rest = '/' + rest
    return authority, rest
1050
Cheryl Sabella0250de42018-04-25 16:51:54 -07001051
def splituser(host):
    """Deprecated public alias of _splituser(): warn, then delegate."""
    message = ("urllib.parse.splituser() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of splituser().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splituser(host)
1057
1058
def _splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Without any '@' the user part
    is None.
    """
    credentials, at_sign, location = host.rpartition('@')
    if at_sign:
        return credentials, location
    return None, location
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001063
Cheryl Sabella0250de42018-04-25 16:51:54 -07001064
def splitpasswd(user):
    """Deprecated public alias of _splitpasswd(): warn, then delegate."""
    message = ("urllib.parse.splitpasswd() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of splitpasswd().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitpasswd(user)
1070
1071
def _splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without any ':' the password
    part is None.
    """
    name, colon, secret = user.partition(':')
    if not colon:
        return name, None
    return name, secret
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001076
Cheryl Sabella0250de42018-04-25 16:51:54 -07001077
def splitport(host):
    """Deprecated public alias of _splitport(): warn, then delegate."""
    message = ("urllib.parse.splitport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of splitport().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitport(host)
1083
1084
# Compiled lazily by _splitport() on its first call.
_portprog = None
def _splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is whatever run of digits follows the last ':' and is
    returned as a string.  If those digits are missing (a trailing ':'),
    the host is returned without the colon and the port is None.  If
    host has no ':' or the text after it is not all digits, the result
    is (host, None) with host unchanged.
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

    found = _portprog.fullmatch(host)
    if found is not None:
        host, digits = found.group(1), found.group(2)
        if digits:
            return host, digits
    return host, None
1099
Cheryl Sabella0250de42018-04-25 16:51:54 -07001100
def splitnport(host, defport=-1):
    """Deprecated public alias of _splitnport(): warn, then delegate."""
    message = ("urllib.parse.splitnport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of splitnport().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitnport(host, defport)
1106
1107
def _splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' is found; defaults to -1.
    Return given default port, too, if nothing follows the last ':'.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' is followed by something that is not a valid number."""
    head, colon, tail = host.rpartition(':')
    if not colon:
        # Without a colon rpartition() leaves the whole input in tail.
        return tail, defport
    if not tail:
        return head, defport
    try:
        return head, int(tail)
    except ValueError:
        return head, None
1123
Cheryl Sabella0250de42018-04-25 16:51:54 -07001124
def splitquery(url):
    """Deprecated public alias of _splitquery(): warn, then delegate."""
    message = ("urllib.parse.splitquery() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of splitquery().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitquery(url)
1130
1131
def _splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without any '?' the result is
    (url, None).
    """
    pieces = url.rsplit('?', 1)
    if len(pieces) == 2:
        return pieces[0], pieces[1]
    return url, None
1138
Cheryl Sabella0250de42018-04-25 16:51:54 -07001139
def splittag(url):
    """Deprecated public alias of _splittag(): warn, then delegate."""
    message = ("urllib.parse.splittag() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of splittag().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splittag(url)
1145
1146
def _splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without any '#' the result is
    (url, None).
    """
    before, hash_sign, after = url.rpartition('#')
    return (before, after) if hash_sign else (url, None)
1153
Cheryl Sabella0250de42018-04-25 16:51:54 -07001154
def splitattr(url):
    """Deprecated public alias of _splitattr(): warn, then delegate."""
    message = ("urllib.parse.splitattr() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of splitattr().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitattr(url)
1160
1161
def _splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'."""
    # str.split() always yields at least one item, so the starred
    # unpacking cannot fail.
    path, *attrs = url.split(';')
    return path, attrs
1167
Cheryl Sabella0250de42018-04-25 16:51:54 -07001168
def splitvalue(attr):
    """Deprecated public alias of _splitvalue(): warn, then delegate."""
    message = ("urllib.parse.splitvalue() is deprecated as of 3.8, "
               "use urllib.parse.parse_qsl() instead")
    # stacklevel=2 attributes the warning to the caller of splitvalue().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitvalue(attr)
1174
1175
def _splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without any '=' the value part
    is None.
    """
    name, equals, value = attr.partition('=')
    if not equals:
        value = None
    return name, value