blob: f5b3487ea9d6083496628cccf21b00ef0eaf42ad [file] [log] [blame]
"""Parse (absolute and relative) URLs.

urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
"""
29
Serhiy Storchaka8ea46162013-03-14 21:31:37 +020030import re
Facundo Batista2ac5de22008-07-07 18:24:11 +000031import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000032import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000033
# Names made available by a star-import of this module.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000040
# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by "urlsplit" and "urlparse".

# Schemes for which urljoin() resolves a relative reference against the
# base; for any other scheme urljoin() returns the second URL unchanged.
uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']

# Schemes consulted by urlunsplit() (to decide whether to emit '//' even
# when the netloc is empty) and by urljoin() (to inherit the base netloc).
uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']

# Schemes for which urlparse() splits ';params' off the path.
uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips']

uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero']
Georg Brandla61b09f2012-08-24 18:15:29 +020072
# Characters valid in scheme names
# (urlsplit() only accepts the text before the first ':' as a scheme when
# every character of it appears in this string).
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
78
# XXX: Consider replacing with functools.lru_cache
# Size threshold for _parse_cache: urlsplit() calls clear_cache() before
# inserting once the cache holds this many entries.
MAX_CACHE_SIZE = 20
# Maps (url, scheme, allow_fragments, type(url), type(scheme)) to the
# SplitResult that urlsplit() computed for those arguments.
_parse_cache = {}
82
def clear_cache():
    """Empty the module-level caches.

    Both the cache of parsed URLs and the cache of safe quoters are
    emptied in place.
    """
    _parse_cache.clear()
    _safe_quoters.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000087
88
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
# These two values are the default codec and error handler used by
# _encode_result() and _decode_args().
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
97
98def _noop(obj):
99 return obj
100
def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Return ``obj.encode(encoding, errors)``.

    _coerce_args() returns this as the result converter when the inputs
    had to be decoded, so results go back out in encoded form.
    """
    encoded = obj.encode(encoding, errors)
    return encoded
104
def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Return a tuple holding every item of *args* decoded to str.

    Falsy items are not decoded; each one is replaced by ''.
    """
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)
108
def _coerce_args(*args):
    """Normalise *args* to str and pick the matching result converter.

    Returns a tuple made of the (possibly decoded) arguments followed by
    one extra callable to apply to the final result:

    - _noop when the first argument is a str (arguments are passed through);
    - _encode_result otherwise (arguments are decoded via _decode_args).

    Raises TypeError when a truthy argument disagrees with the first one
    about being a str.
    """
    expect_str = isinstance(args[0], str)
    # Falsy values are exempt from the check so that the "scheme=''"
    # default argument of some functions works with non-str input too.
    if any(arg and isinstance(arg, str) != expect_str for arg in args[1:]):
        raise TypeError("Cannot mix str and non-str arguments")
    if not expect_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
124
125# Result objects are more helpful than simple tuples
126class _ResultMixinStr(object):
127 """Standard approach to encoding parsed results from str to bytes"""
128 __slots__ = ()
129
130 def encode(self, encoding='ascii', errors='strict'):
131 return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))
132
133
134class _ResultMixinBytes(object):
135 """Standard approach to decoding parsed results from bytes to str"""
136 __slots__ = ()
137
138 def decode(self, encoding='ascii', errors='strict'):
139 return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))
140
141
142class _NetlocResultMixinBase(object):
143 """Shared methods for the parsed result objects containing a netloc element"""
144 __slots__ = ()
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000145
146 @property
147 def username(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000148 return self._userinfo[0]
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000149
150 @property
151 def password(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000152 return self._userinfo[1]
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000153
154 @property
155 def hostname(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000156 hostname = self._hostinfo[0]
157 if not hostname:
Коренберг Маркfbd60512017-12-21 17:16:17 +0500158 return None
159 # Scoped IPv6 address may have zone info, which must not be lowercased
160 # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
161 separator = '%' if isinstance(hostname, str) else b'%'
162 hostname, percent, zone = hostname.partition(separator)
163 return hostname.lower() + percent + zone
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000164
165 @property
166 def port(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000167 port = self._hostinfo[1]
168 if port is not None:
169 port = int(port, 10)
Senthil Kumaran2fc5a502012-05-24 21:56:17 +0800170 if not ( 0 <= port <= 65535):
Robert Collinsdfa95c92015-08-10 09:53:30 +1200171 raise ValueError("Port out of range 0-65535")
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000172 return port
173
174
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """str flavour of the netloc helpers: splits ``self.netloc`` on demand."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last '@'.

        Both are None without an '@'; password is None without a ':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last '@'.

        A bracketed host is returned without its brackets; port is None
        when it is missing or empty.
        """
        host_and_port = self.netloc.rpartition('@')[2]
        _, open_bracket, after_bracket = host_and_port.partition('[')
        if open_bracket:
            # Bracketed host: the port, if any, follows the closing ']'.
            host, _, trailer = after_bracket.partition(']')
            port = trailer.partition(':')[2]
        else:
            host, _, port = host_and_port.partition(':')
        return host, (port or None)
203
204
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """bytes flavour of the netloc helpers: splits ``self.netloc`` on demand."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last b'@'.

        Both are None without a b'@'; password is None without a b':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(b':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last b'@'.

        A bracketed host is returned without its brackets; port is None
        when it is missing or empty.
        """
        host_and_port = self.netloc.rpartition(b'@')[2]
        _, open_bracket, after_bracket = host_and_port.partition(b'[')
        if open_bracket:
            # Bracketed host: the port, if any, follows the closing b']'.
            host, _, trailer = after_bracket.partition(b']')
            port = trailer.partition(b':')[2]
        else:
            host, _, port = host_and_port.partition(b':')
        return host, (port or None)
233
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000234
235from collections import namedtuple
236
# Plain namedtuple bases; the public result classes combine these with
# the str/bytes mixins.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')
242
# Attach docstrings to the generated namedtuple classes and to each of
# their fields.  ParseResult reuses the SplitResult field docs for the
# fields the two tuples have in common.
_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__
302
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000303
# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr
308
# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """str result of urldefrag(): the URL without fragment, and the fragment."""
    __slots__ = ()

    def geturl(self):
        """Return the URL with the fragment re-attached when it is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000317
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """str 5-tuple built by urlsplit(), with the netloc helper properties."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL via urlunsplit()."""
        return urlunsplit(self)
322
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """str 6-tuple built by urlparse(), with the netloc helper properties."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL via urlunparse()."""
        return urlunparse(self)
327
# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """bytes counterpart of DefragResult."""
    __slots__ = ()

    def geturl(self):
        """Return the URL with the fragment re-attached when it is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
336
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """bytes counterpart of SplitResult."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL via urlunsplit()."""
        return urlunsplit(self)
341
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """bytes counterpart of ParseResult."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL via urlunparse()."""
        return urlunparse(self)
346
# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Cross-link every str result class with its bytes twin.

    The str class gets ``_encoded_counterpart`` (used by encode()) and the
    bytes class gets ``_decoded_counterpart`` (used by decode()).
    """
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000360
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    The work is delegated to urlsplit(); afterwards, for schemes listed in
    uses_params, the ';params' part is split off the path.

    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    The components are not broken up into smaller bits (e.g. netloc is a
    single string) and % escapes are not expanded."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    if ';' in path and scheme in uses_params:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000376
377def _splitparams(url):
378 if '/' in url:
379 i = url.find(';', url.rfind('/'))
380 if i < 0:
381 return url, ''
382 else:
383 i = url.find(';')
384 return url[:i], url[i+1:]
385
386def _splitnetloc(url, start=0):
387 delim = len(url) # position of end of domain part of url, default is end
388 for c in '/?#': # look for delimiters; the order is NOT important
389 wdelim = url.find(c, start) # find first of this delim
390 if wdelim >= 0: # if found
391 delim = min(delim, wdelim) # use earliest delim position
392 return url[start:delim], url[delim:] # return (domain, rest)
393
Steve Dowerdaad2c42019-03-07 09:08:18 -0800394def _checknetloc(netloc):
395 if not netloc or netloc.isascii():
396 return
397 # looking for characters like \u2100 that expand to 'a/c'
398 # IDNA uses NFKC equivalence, so normalize for this check
399 import unicodedata
Miss Islington (bot)4d723e72019-04-30 05:21:02 -0700400 n = netloc.rpartition('@')[2] # ignore anything to the left of '@'
401 n = n.replace(':', '') # ignore characters already included
402 n = n.replace('#', '') # but not the surrounding text
403 n = n.replace('?', '')
404 netloc2 = unicodedata.normalize('NFKC', n)
405 if n == netloc2:
Steve Dowerdaad2c42019-03-07 09:08:18 -0800406 return
Steve Dowerdaad2c42019-03-07 09:08:18 -0800407 for c in '/?#@:':
408 if c in netloc2:
Miss Islington (bot)4d723e72019-04-30 05:21:02 -0700409 raise ValueError("netloc '" + netloc + "' contains invalid " +
Steve Dowerdaad2c42019-03-07 09:08:18 -0800410 "characters under NFKC normalization")
411
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    *scheme* is the value reported when the URL itself carries no scheme.
    When *allow_fragments* is false, '#' is not treated as a fragment
    delimiter.  Results are memoized in _parse_cache.

    Raises ValueError for a netloc with an unmatched '[' or ']' and for a
    netloc rejected by _checknetloc()."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key (both are computed after
    # _coerce_args() has run).
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            # NOTE: unlike the general path below, this branch does not
            # apply the "is it really a port number" check.
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            _checknetloc(netloc)
            v = SplitResult('http', netloc, url, query, fragment)
            _parse_cache[key] = v
            return _coerce_result(v)
        # The text before ':' only counts as a scheme if every character
        # is a valid scheme character (the else clause runs when the loop
        # finishes without hitting break).
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[i+1:]
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = url[:i].lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    # The fragment is split off before the query, so a '?' that appears
    # after '#' stays inside the fragment.
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000468
def urlunparse(components):
    """Rebuild a URL from the six components produced by urlparse().

    The result may differ slightly from the URL that was parsed, while
    remaining equivalent, if the original had redundant delimiters,
    e.g. a ? with an empty query (the draft states that these are
    equivalent)."""
    scheme, netloc, path, params, query, fragment, _coerce_result = (
        _coerce_args(*components))
    if params:
        # Fold the params back into the path; urlunsplit() does the rest.
        path = "%s;%s" % (path, params)
    rebuilt = urlunsplit((scheme, netloc, path, query, fragment))
    return _coerce_result(rebuilt)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000479
def urlunsplit(components):
    """Rebuild a complete URL from the five components produced by urlsplit().

    *components* can be any five-item iterable.  The result may differ
    slightly from the URL that was parsed, while remaining equivalent, if
    the original had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment, _coerce_result = (
        _coerce_args(*components))
    # '//' is emitted when there is a netloc, or when the scheme is one of
    # uses_netloc and the path does not already start with '//'.
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000498
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned as is.  *url* is
    returned unchanged when its scheme differs from the base scheme or is
    not listed in uses_relative.  *allow_fragments* is forwarded to
    urlparse()."""
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default for a reference that has none.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own network location: only the
            # scheme can come from the base.
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not path and not params:
        # Reference without path or params: reuse those of the base, and
        # the base query too unless the reference has one.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        del base_parts[-1]

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = filter(None, segments[1:-1])

    resolved_path = []

    for seg in segments:
        if seg == '..':
            try:
                resolved_path.pop()
            except IndexError:
                # ignore any .. segments that would otherwise cause an IndexError
                # when popped from resolved_path if resolving for rfc3986
                pass
        elif seg == '.':
            continue
        else:
            resolved_path.append(seg)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    # An empty resolved path is emitted as '/'.
    return _coerce_result(urlunparse((scheme, netloc, '/'.join(
        resolved_path) or '/', params, query, fragment)))
Antoine Pitrou55ac5b32014-08-21 19:16:17 -0400566
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000567
def urldefrag(url):
    """Separate *url* from its fragment identifier.

    Returns a DefragResult (converted back to the input type) holding the
    URL without fragment and the fragment itself.  The fragment is the
    empty string when the URL contains no '#'.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return _coerce_result(DefragResult(stripped, fragment))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000583
_hexdig = '0123456789ABCDEFabcdef'
# Two-hex-digit bytes -> decoded single byte; built by the first call of
# unquote_to_bytes() that needs it.
_hextobyte = None

def unquote_to_bytes(string):
    """Replace %xx escapes by the byte they stand for and return bytes.

    unquote_to_bytes('abc%20def') -> b'abc def'.

    A '%' that is not followed by two hex digits is left untouched.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Touching .split makes a falsy object that is not string-like
        # (None, 0, ...) fail with AttributeError instead of passing.
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escaped = string.split(b'%')
    if not escaped:
        # No '%' at all: nothing to decode.
        return string
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    global _hextobyte
    if _hextobyte is None:
        table = {}
        for high in _hexdig:
            for low in _hexdig:
                table[(high + low).encode()] = bytes.fromhex(high + low)
        _hextobyte = table
    pieces = [head]
    for chunk in escaped:
        decoded = _hextobyte.get(chunk[:2])
        if decoded is None:
            # Not a valid escape: put the '%' back and keep the text.
            pieces.append(b'%')
            pieces.append(chunk)
        else:
            pieces.append(decoded)
            pieces.append(chunk[2:])
    return b''.join(pieces)
616
# Matches runs of ASCII characters; unquote() only percent-decodes those.
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    string may be either a str or a bytes object; a str is always returned.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    if isinstance(string, bytes):
        # bytes input: unescape at the byte level, then decode everything.
        return unquote_to_bytes(string).decode(encoding, errors)
    if '%' not in string:
        # Touching .split rejects objects that are not string-like.
        string.split
        return string
    # Splitting on a pattern with one capturing group alternates between
    # non-ASCII text (even indexes, kept as is) and ASCII runs (odd
    # indexes, which may hold escapes).
    bits = _asciire.split(string)
    res = [bits[0]]
    append = res.append
    for i in range(1, len(bits), 2):
        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
        append(bits[i + 1])
    return ''.join(res)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000643
Senthil Kumaran257b9802017-04-04 21:19:43 -0700644
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query string into a dictionary of value lists.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value keeps them as blank strings; the default false
            value drops them as if they had not been included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, a ValueError is raised when
            parse_qsl() reads more than that many fields.

        All arguments are handed over to parse_qsl(), which does the parsing.

        Returns a dictionary mapping each field name to the list of its
        values, in order of appearance.
    """
    grouped = {}
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                      encoding=encoding, errors=errors,
                      max_num_fields=max_num_fields)
    for name, value in pairs:
        # The first occurrence creates the list; later ones extend it.
        grouped.setdefault(name, []).append(value)
    return grouped
Facundo Batistac469d4c2008-09-03 22:49:01 +0000682
Senthil Kumaran257b9802017-04-04 21:19:43 -0700683
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query string into a list of (name, value) pairs.

        Arguments:

        qs: percent-encoded query string to be parsed.  Fields are
            separated by '&' or ';'.

        keep_blank_values: if true, blank values are kept as empty
            strings; if false (the default), fields with a blank value
            are dropped as if they were not present.

        strict_parsing: if true, an empty field or a field without '='
            raises ValueError; if false (the default), such fields are
            silently skipped (or, for a missing '=', kept with an empty
            value when keep_blank_values is true).

        encoding and errors: how percent-encoded sequences are decoded
            into Unicode characters, as accepted by bytes.decode().

        max_num_fields: int. If set, ValueError is raised when the query
            contains more than this many fields.

        Returns a list of (name, value) tuples in query order.
    """
    qs, _coerce_result = _coerce_args(qs)

    # Count separators up front so an oversized query is rejected before
    # any per-field work is done.  This prevents a memory exhaustion DOS
    # attack via post bodies with many fields.
    if (max_num_fields is not None
            and max_num_fields < 1 + qs.count('&') + qs.count(';')):
        raise ValueError('Max number of fields exceeded')

    result = []
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not field and not strict_parsing:
                continue
            raw_name, equals, raw_value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control-name with no equal sign: keep it with an
                # empty value only when blanks are wanted.
                if not keep_blank_values:
                    continue
            if raw_value or keep_blank_values:
                name = unquote(raw_name.replace('+', ' '),
                               encoding=encoding, errors=errors)
                value = unquote(raw_value.replace('+', ' '),
                                encoding=encoding, errors=errors)
                result.append((_coerce_result(name), _coerce_result(value)))
    return result
743
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but first turn every '+' into a space, as required
    for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # The replacement must happen before unquoting so that an encoded
    # plus ('%2B') survives as a literal '+'.
    return unquote(string.replace('+', ' '), encoding, errors)
752
# Byte values that are never percent-encoded: ASCII letters, digits and
# "_.-~" (the "unreserved" characters listed in quote()'s docstring).
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-~')
# The same byte values as a bytes object; quote_from_bytes() passes it to
# bytes.rstrip() to detect input that needs no quoting at all.
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache used by quote_from_bytes(): normalized 'safe' bytes -> the bound
# __getitem__ of the Quoter built for that safe set.
_safe_quoters = {}
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000759
class Quoter(collections.defaultdict):
    """Self-filling map from a byte value (an int in range(0, 256)) to its
    quoted string form.

    A byte that belongs to the safe set maps to the corresponding
    character; every other byte maps to its '%XX' percent-encoding with
    upper-case hex digits.
    """
    # Results are memoized in the dict itself, so a repeated lookup of the
    # same byte is a plain dict hit and never re-enters __missing__.
    def __init__(self, safe):
        """safe: bytes object with extra byte values to leave unquoted."""
        self.safe = _ALWAYS_SAFE | frozenset(safe)

    def __repr__(self):
        # The inherited repr would present this as a bare defaultdict.
        return "<%s %r>" % (type(self).__name__, dict(self))

    def __missing__(self, b):
        # Cache miss: compute the quoted form once, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
781
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Percent-encode a string for use in a URL component.

    Different parts of a URL (path, query, ...) reserve different
    characters.  This function takes a cautious, not minimal, approach
    that is suitable for most of those parts.

    RFC 3986 (Uniform Resource Identifier: Generic Syntax) defines:

    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="

    A reserved character is reserved in some URL component, but not
    necessarily in every one.

    Every character is %-escaped unless it is unreserved ("always safe")
    or listed in the safe argument.

    safe defaults to '/': the slash is reserved, but quote() is typically
    applied to a path whose existing slashes should be kept.

    Python 3.7 moved from RFC 2396 to RFC 3986 for quoting URL strings,
    so "~" is now among the unreserved characters.

    string and safe may each be str or bytes.  encoding and errors must
    not be given when string is a bytes object (TypeError is raised).

    For a str, encoding and errors say how non-ASCII characters are
    turned into bytes, as accepted by str.encode.  The defaults are
    encoding='utf-8' and errors='strict' (unsupported characters raise
    UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Already bytes-like: there is nothing to encode, so reject the
        # text-only options rather than ignore them.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if not string:
        return string
    encoded = string.encode('utf-8' if encoding is None else encoding,
                            'strict' if errors is None else errors)
    return quote_from_bytes(encoded, safe)
835
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also turn ' ' into '+', as required for quoting
    HTML form values.  A '+' already present in the input is escaped
    unless it is listed in safe.  Unlike quote(), safe does not default
    to '/'.
    """
    # string may be str or bytes.  When it holds no space, plain quote()
    # already gives the right answer; any other type takes the general
    # path below.
    if isinstance(string, str):
        needs_plus_pass = ' ' in string
    elif isinstance(string, bytes):
        needs_plus_pass = b' ' in string
    else:
        needs_plus_pass = True
    if not needs_plus_pass:
        return quote(string, safe, encoding, errors)

    # Keep spaces unquoted for now (matching the type of safe), then swap
    # them for '+' in the quoted result.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000852
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but take a bytes (or bytearray) object instead of a
    str and skip the string-to-bytes encoding step.  The result is always
    a str.  Raises TypeError if bs is not bytes or bytearray.

    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    # Normalize 'safe' to a bytes object holding only ASCII values, so it
    # can serve both as an rstrip() argument and as a cache key.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)
    # Fast path: nothing is left once all safe bytes are stripped, so
    # every byte is safe and the input can be returned decoded as-is.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        # One Quoter per distinct safe set, shared across calls.
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000874
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.
    Any other object is converted with str() before quoting.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).

    Raises TypeError if query is neither a mapping (something with an
    items() method) nor a sequence whose first element is a tuple; the
    original error is attached as the exception's __cause__.
    """

    def quote_component(component):
        # bytes are quoted as-is and never get encoding/errors; anything
        # else is stringified first.
        if isinstance(component, bytes):
            return quote_via(component, safe)
        return quote_via(str(component), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    pairs = []
    for key, value in query:
        key = quote_component(key)
        if not doseq or isinstance(value, bytes):
            # Scalar handling: one "key=value" parameter.
            pairs.append(key + '=' + quote_component(value))
        elif isinstance(value, str):
            # A str is a sequence, but must not be split into characters.
            pairs.append(key + '=' + quote_via(value, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(value)
            except TypeError:
                # not a sequence
                pairs.append(key + '=' + quote_component(value))
            else:
                # One parameter per element of the sequence.
                pairs.extend(key + '=' + quote_component(elt)
                             for elt in value)
    return '&'.join(pairs)
954
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Despite the name, a str argument comes back as a str: it is only
    checked to be pure ASCII.  Non-str arguments are returned unchanged.
    Raises UnicodeError if a str contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        # The encode/decode round trip is only there to validate ASCII.
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
967
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str(); surrounding whitespace, an
    enclosing '<...>' pair and a leading 'URL:' prefix are each removed
    when present.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
975
# Compiled lazily by splittype() on first use.
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type (scheme) is returned lower-cased.  If url does not start
    with a scheme followed by ':', returns (None, url).
    """
    global _typeprog
    if _typeprog is None:
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = _typeprog.match(url)
    if found is None:
        return None, url
    return found.group(1).lower(), found.group(2)
988
# Compiled lazily by splithost() on first use.
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    If url does not start with '//', returns (None, url).  A non-empty
    remainder that does not begin with '/' (for example '?query') gets a
    '/' prepended.
    """
    global _hostprog
    if _hostprog is None:
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    found = _hostprog.match(url)
    if not found:
        return None, url
    host_port, path = found.group(1, 2)
    if path and not path.startswith('/'):
        path = '/' + path
    return host_port, path
1003
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split is made at the last '@'.  Without an '@' the user part is
    None.
    """
    user, at_sign, host = host.rpartition('@')
    if not at_sign:
        return None, host
    return user, host
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001008
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split is made at the first ':'.  Without a ':' the password part
    is None.
    """
    user, colon, passwd = user.partition(':')
    if not colon:
        return user, None
    return user, passwd
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001013
# Compiled lazily by splitport() on first use.
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  It is None when there is
    no ':' suffix, when the text after the last ':' is not all digits, or
    when it is empty (in which case the trailing ':' is dropped from the
    returned host).
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)$', re.DOTALL)

    found = _portprog.match(host)
    if found is None:
        return host, None
    hostname, port = found.groups()
    # An empty digit run ("host:") counts as "no port".
    return hostname, (port or None)
1028
def splitnport(host, defport=-1):
    """Split host and port, returning a numeric port.

    Return the given default port (-1 unless specified) if there is no
    ':' or nothing follows the last ':'.
    Return the port as an int if a valid number is found after ':'.
    Return None as the port if the text after ':' is not a valid number.
    """
    name, colon, port = host.rpartition(':')
    if not colon:
        # No ':' at all: rpartition leaves the whole input in the last slot.
        return port, defport
    if not port:
        return name, defport
    try:
        return name, int(port)
    except ValueError:
        return name, None
1044
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split is made at the last '?'.  Without a '?' the query part is
    None and url is returned unchanged as the path.
    """
    path, question_mark, query = url.rpartition('?')
    return (path, query) if question_mark else (url, None)
1051
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split is made at the last '#'.  Without a '#' the tag part is
    None and url is returned unchanged as the path.
    """
    path, hash_mark, tag = url.rpartition('#')
    return (path, tag) if hash_mark else (url, None)
1058
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    Everything before the first ';' is the path; the list is empty when
    url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
1064
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split is made at the first '='.  Without an '=' the value part is
    None.
    """
    name, equals, value = attr.partition('=')
    if not equals:
        return name, None
    return name, value