blob: 3f8cfe5300c024d085f35750a60714a4242b0600 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaranfd41e082010-04-17 14:44:14 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000011RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000012Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
David Malcolmee255682010-12-02 16:41:00 +000014RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000015
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000019RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000020McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000028"""
29
Serhiy Storchaka8ea46162013-03-14 21:31:37 +020030import re
Facundo Batista2ac5de22008-07-07 18:24:11 +000031import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000032import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000033
# Public names exported by a star import of this module.
__all__ = [
    # splitting, joining and reassembling URLs
    "urlparse", "urlunparse", "urljoin", "urldefrag",
    "urlsplit", "urlunsplit",
    # query strings
    "urlencode", "parse_qs", "parse_qsl",
    # percent-quoting
    "quote", "quote_plus", "quote_from_bytes",
    "unquote", "unquote_plus", "unquote_to_bytes",
    # structured result types (str flavour, then bytes flavour)
    "DefragResult", "ParseResult", "SplitResult",
    "DefragResultBytes", "ParseResultBytes", "SplitResultBytes",
]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000040
# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by "urlsplit" and "urlparse".

# Schemes for which urljoin() resolves relative references.
uses_relative = [
    '', 'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', 'sftp',
    'svn', 'svn+ssh', 'ws', 'wss',
]

# Schemes whose URLs carry a network location ("//netloc").
uses_netloc = [
    '', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
    'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
    'ws', 'wss',
]

# Schemes for which urlparse() splits ";params" off the path.
uses_params = [
    '', 'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    'mms', 'sftp', 'tel',
]

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips',
]

uses_query = [
    '', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips',
]

uses_fragment = [
    '', 'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero',
]

# Characters valid in scheme names
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789'
    '+-.'
)
78
# XXX: Consider replacing with functools.lru_cache
# urlsplit() empties the caches once _parse_cache holds this many entries.
MAX_CACHE_SIZE = 20
# Memo of urlsplit() results, keyed on its (coerced) arguments and their types.
_parse_cache = {}


def clear_cache():
    """Clear the parse cache and the quoters cache."""
    # _safe_quoters is a module-level cache defined elsewhere in this file.
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000087
88
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'


def _noop(obj):
    """Return *obj* untouched; the result coercer used for str input."""
    return obj


def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Encode a str-based result; the result coercer used for non-str input."""
    return obj.encode(encoding, errors)


def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Return a tuple with every item of *args* decoded to str.

    Falsy items (empty bytes, '', None) are replaced by '' without
    calling decode() on them.
    """
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)


def _coerce_args(*args):
    """Normalise the arguments to str and pick the matching result coercer.

    Returns the (possibly decoded) arguments followed by one extra item:
    _noop when the first argument is a str, _encode_result otherwise.
    Raises TypeError when a non-empty later argument disagrees with the
    first argument about being a str.
    """
    wants_str = isinstance(args[0], str)
    for extra in args[1:]:
        # Empty values are exempt so that the "scheme=''" default
        # argument of some functions works with non-str input too.
        if not extra:
            continue
        if isinstance(extra, str) != wants_str:
            raise TypeError("Cannot mix str and non-str arguments")
    if not wants_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
123 return _decode_args(args) + (_encode_result,)
124
# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Return the bytes counterpart of this result.

        Every field is encoded with *encoding*/*errors* and the pieces are
        passed positionally to the class's ``_encoded_counterpart``.
        """
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return self._encoded_counterpart(*encoded_fields)
132
133
class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Return the str counterpart of this result.

        Every field is decoded with *encoding*/*errors* and the pieces are
        passed positionally to the class's ``_decoded_counterpart``.
        """
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return self._decoded_counterpart(*decoded_fields)
140
141
class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element"""
    __slots__ = ()

    # Subclasses supply two properties this class builds on:
    #   _userinfo -> (username, password)
    #   _hostinfo -> (hostname, port)   with port still in textual form

    @property
    def username(self):
        """First item of the subclass-provided ``_userinfo`` pair."""
        return self._userinfo[0]

    @property
    def password(self):
        """Second item of the subclass-provided ``_userinfo`` pair."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when the host part is empty."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        # Scoped IPv6 address may have zone info, which must not be lowercased
        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
        separator = '%' if isinstance(hostname, str) else b'%'
        hostname, percent, zone = hostname.partition(separator)
        return hostname.lower() + percent + zone

    @property
    def port(self):
        """Port as an int, or None when no port is present.

        Raises ValueError when the port is not made up solely of ASCII
        digits, or when its value lies outside 0-65535.
        """
        port = self._hostinfo[1]
        if port is None:
            return None
        # int() alone is too lenient: it accepts a sign, surrounding
        # whitespace, underscores and non-ASCII digits, none of which are
        # valid in a URL port.  Stripping the ASCII digits from both ends
        # leaves something behind exactly when another character occurs.
        digits = '0123456789' if isinstance(port, str) else b'0123456789'
        if port.strip(digits):
            message = f'Port could not be cast to integer value as {port!r}'
            raise ValueError(message)
        port = int(port, 10)
        if not (0 <= port <= 65535):
            raise ValueError("Port out of range 0-65535")
        return port
177
178
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """Netloc decomposition for results whose fields are str."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from the text before the last '@'.

        Both are None without an '@'; password is None without a ':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from the text after the last '@'.

        A bracketed host (IPv6 literal) is returned without its brackets.
        port is the raw text after ':' or None when that text is empty.
        """
        host_port = self.netloc.rpartition('@')[2]
        _, open_bracket, after_bracket = host_port.partition('[')
        if open_bracket:
            host, _, tail = after_bracket.partition(']')
            port = tail.partition(':')[2]
        else:
            host, _, port = host_port.partition(':')
        return host, (port or None)
207
208
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """Netloc decomposition for results whose fields are bytes."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from the bytes before the last b'@'.

        Both are None without a b'@'; password is None without a b':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(b':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from the bytes after the last b'@'.

        A bracketed host (IPv6 literal) is returned without its brackets.
        port is the raw bytes after b':' or None when they are empty.
        """
        host_port = self.netloc.rpartition(b'@')[2]
        _, open_bracket, after_bracket = host_port.partition(b'[')
        if open_bracket:
            host, _, tail = after_bracket.partition(b']')
            port = tail.partition(b':')[2]
        else:
            host, _, port = host_port.partition(b':')
        return host, (port or None)
237
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000238
from collections import namedtuple

# Plain tuple bases for the structured results; the public classes add
# the str/bytes specific behaviour on top of these.
_DefragResultBase = namedtuple('DefragResult', ['url', 'fragment'])
_SplitResultBase = namedtuple(
    'SplitResult', ['scheme', 'netloc', 'path', 'query', 'fragment'])
_ParseResultBase = namedtuple(
    'ParseResult',
    ['scheme', 'netloc', 'path', 'params', 'query', 'fragment'])

# --- DefragResult documentation ---

_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

# --- SplitResult documentation ---

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

# --- ParseResult documentation ---

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

# Every field ParseResult shares with SplitResult reuses its description.
for _shared_field in _SplitResultBase._fields:
    getattr(_ParseResultBase, _shared_field).__doc__ = (
        getattr(_SplitResultBase, _shared_field).__doc__)
del _shared_field
306
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000307
# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr


# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    __slots__ = ()

    def geturl(self):
        # Re-attach the fragment only when there is one to re-attach.
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment


class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    __slots__ = ()

    def geturl(self):
        # Reassemble the five components into a URL string.
        return urlunsplit(self)


class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    __slots__ = ()

    def geturl(self):
        # Reassemble the six components into a URL string.
        return urlunparse(self)
331
# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    __slots__ = ()

    def geturl(self):
        # Re-attach the fragment only when there is one to re-attach.
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment


class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    __slots__ = ()

    def geturl(self):
        # Reassemble the five components into a bytes URL.
        return urlunsplit(self)


class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    __slots__ = ()

    def geturl(self):
        # Reassemble the six components into a bytes URL.
        return urlunparse(self)
350
# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Cross-link each str result class with its bytes twin.

    The links are what encode() and decode() on the result mixins use to
    find the class to instantiate.
    """
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls


# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000364
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a ParseResult 6-tuple: (scheme, netloc, path, params, query,
    fragment).  *scheme* is the default used when the URL has none and
    *allow_fragments* is passed on to urlsplit().  The components are not
    broken up into smaller bits (e.g. netloc is a single string) and
    % escapes are not expanded.  The result has the same str/bytes
    flavour as the input.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Params are only split off for schemes known to use them.
    if ';' in path and scheme in uses_params:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000380
def _splitparams(url):
    """Split *url* (a path) into (path, params) at the params separator.

    Only a ';' found in the last path segment counts: when the path
    contains a '/', the search starts at the last one.  When no such ';'
    exists the path is returned unchanged together with an empty params
    string.
    """
    if '/' in url:
        i = url.find(';', url.rfind('/'))
    else:
        i = url.find(';')
    # A missing separator must be handled on both branches; slicing with
    # i == -1 would otherwise chop the final character off the path.
    if i < 0:
        return url, ''
    return url[:i], url[i+1:]
389
def _splitnetloc(url, start=0):
    """Return (netloc, rest) for *url*, with the netloc beginning at *start*.

    The netloc ends at the first '/', '?' or '#' found at or after
    *start*, or at the end of the string when none of them occurs; *rest*
    is everything from that position on.
    """
    found = [pos for pos in (url.find(ch, start) for ch in '/?#') if pos >= 0]
    end = min(found, default=len(url))
    return url[start:end], url[end:]
397
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Return a SplitResult 5-tuple: (scheme, netloc, path, query, fragment).
    *scheme* is the default used when the URL has none; with a false
    *allow_fragments* a '#' is left in place instead of starting the
    fragment.  The components are not broken up into smaller bits (e.g.
    netloc is a single string) and % escapes are not expanded.  Raises
    ValueError for a netloc with unbalanced '[' / ']'.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    # Step 1: decide whether the text before the first ':' is a scheme.
    colon = url.find(':')
    if colon > 0:
        prefix, rest = url[:colon], url[colon + 1:]
        if prefix == 'http':  # optimize the common case
            scheme, url = 'http', rest
        elif all(ch in scheme_chars for ch in prefix):
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            if not rest or any(ch not in '0123456789' for ch in rest):
                # not a port number
                scheme, url = prefix.lower(), rest

    # Step 2: peel netloc, fragment and query off what is left.
    netloc = query = fragment = ''
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Exactly one kind of bracket present means a malformed IPv6 host.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return _coerce_result(result)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000452
def urlunparse(components):
    """Put a parsed URL back together again.

    *components* is a six-item iterable as returned by urlparse().  This
    may result in a slightly different, but equivalent URL, if the URL
    that was parsed originally had redundant delimiters, e.g. a ? with an
    empty query (the draft states that these are equivalent).
    """
    (scheme, netloc, url, params, query, fragment,
     _coerce_result) = _coerce_args(*components)
    if params:
        # Fold the params back into the path, then reuse urlunsplit().
        url = "%s;%s" % (url, params)
    rebuilt = urlunsplit((scheme, netloc, url, query, fragment))
    return _coerce_result(rebuilt)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000463
def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL.

    *components* can be any five-item iterable.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had unnecessary delimiters (for example, a ? with an empty
    query; the RFC states that these are equivalent).
    """
    (scheme, netloc, url, query, fragment,
     _coerce_result) = _coerce_args(*components)
    # An authority part is written when there is a netloc, or when the
    # scheme is one that uses a netloc and the path does not already
    # start with '//'.
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000482
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty *base* returns *url* unchanged and an empty *url* returns
    *base* unchanged.  *allow_fragments* is forwarded to urlparse().
    """
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    # A different scheme, or one without relative forms: url stands alone.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            # url names its own network location; nothing comes from base.
            return _coerce_result(urlunparse(
                (scheme, netloc, path, params, query, fragment)))
        netloc = bnetloc

    if not path and not params:
        # Only a query and/or fragment was given: keep the base path.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse(
            (scheme, netloc, path, params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        del base_parts[-1]

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = filter(None, segments[1:-1])

    resolved = []
    for segment in segments:
        if segment == '.':
            continue
        if segment == '..':
            # ignore any .. segments that would climb above the root
            # when resolving for rfc3986
            if resolved:
                resolved.pop()
        else:
            resolved.append(segment)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved.append('')

    joined_path = '/'.join(resolved) or '/'
    return _coerce_result(urlunparse(
        (scheme, netloc, joined_path, params, query, fragment)))
Antoine Pitrou55ac5b32014-08-21 19:16:17 -0400550
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000551
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a DefragResult (a 2-tuple) of the defragmented URL and the
    fragment.  If the URL contained no fragment, the second element is
    the empty string and the URL is returned as given.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    # Parse, then rebuild with the fragment slot emptied.
    s, n, p, a, q, frag = urlparse(url)
    without_fragment = urlunparse((s, n, p, a, q, ''))
    return _coerce_result(DefragResult(without_fragment, frag))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000567
_hexdig = '0123456789ABCDEFabcdef'
# Maps each two-hex-digit bytes pair to its byte; built on first use.
_hextobyte = None


def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    global _hextobyte
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escaped = string.split(b'%')
    if not escaped:
        # No '%' at all: nothing to unquote.
        return string
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    if _hextobyte is None:
        _hextobyte = {(a + b).encode(): bytes.fromhex(a + b)
                      for a in _hexdig for b in _hexdig}
    chunks = [head]
    for piece in escaped:
        byte = _hextobyte.get(piece[:2])
        if byte is None:
            # Not a valid %xx escape: keep the '%' and the text literally.
            chunks.append(b'%')
            chunks.append(piece)
        else:
            chunks.append(byte)
            chunks.append(piece[2:])
    return b''.join(chunks)
600
# Splits text into alternating non-ASCII / ASCII runs (ASCII runs captured).
_asciire = re.compile('([\x00-\x7f]+)')


def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    The optional encoding and errors parameters specify how to decode
    percent-encoded sequences into Unicode characters, as accepted by the
    bytes.decode() method.  By default, percent-encoded sequences are
    decoded with UTF-8, and invalid sequences are replaced by a
    placeholder character.  Passing None for either selects its default.

    unquote('abc%20def') -> 'abc def'.
    """
    if '%' not in string:
        # Attribute access doubles as a "string-like object" check.
        string.split
        return string
    encoding = 'utf-8' if encoding is None else encoding
    errors = 'replace' if errors is None else errors
    # split() with one capture group yields an odd-length list:
    # [other, ascii, other, ascii, ..., other]
    pieces = _asciire.split(string)
    out = [pieces[0]]
    for ascii_run, other_run in zip(pieces[1::2], pieces[2::2]):
        # Only ASCII runs can hold escapes; the rest is copied verbatim.
        out.append(unquote_to_bytes(ascii_run).decode(encoding, errors))
        out.append(other_run)
    return ''.join(out)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000627
Senthil Kumaran257b9802017-04-04 21:19:43 -0700628
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    Returns a dictionary mapping each field name to the list of its
    values, in the order they appear in the query.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors):
        grouped.setdefault(name, []).append(value)
    return grouped
Facundo Batistac469d4c2008-09-03 22:49:01 +0000662
Senthil Kumaran257b9802017-04-04 21:19:43 -0700663
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    Returns a list of (name, value) pairs in query order.
    """
    qs, _coerce_result = _coerce_args(qs)

    def _decode(component):
        # '+' stands for a space in form data; then undo the % escapes
        # and convert back to the caller's str/bytes flavour.
        text = unquote(component.replace('+', ' '),
                       encoding=encoding, errors=errors)
        return _coerce_result(text)

    # Fields may be separated by either '&' or ';'.
    fields = [part for chunk in qs.split('&') for part in chunk.split(';')]
    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        raw_name, equals, raw_value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # Handle case of a control-name with no equal sign
            if not keep_blank_values:
                continue
        if raw_value or keep_blank_values:
            result.append((_decode(raw_name), _decode(raw_value)))
    return result
711
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # Plus signs are translated first so that a percent-encoded '%2B'
    # is still decoded to a literal '+' by unquote().
    return unquote(string.replace('+', ' '), encoding, errors)
720
# Byte values that are never percent-encoded: the ASCII letters, the
# digits and the four punctuation characters "_", ".", "-" and "~".
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
    b'abcdefghijklmnopqrstuvwxyz' +
    b'0123456789' +
    b'_.-~'
)
# The same byte values as a bytes object (quote_from_bytes() passes it
# to bytes.rstrip() to detect input that needs no quoting at all).
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache filled by quote_from_bytes(): normalized 'safe' bytes -> the
# lookup callable of the Quoter built for that safe set.
_safe_quoters = dict()
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000727
class Quoter(collections.defaultdict):
    """A caching mapping from byte values (ints) to their quoted strings.

    Looking up a byte value yields the character itself when the value
    is in the "safe" set (the caller-supplied safe bytes plus the default
    always-safe set), and its '%XX' percent-encoding otherwise.
    """
    # The defaultdict base provides the cache: __missing__ runs only on
    # the first lookup of a key, later lookups are plain dict hits that
    # do not call Python code at all.
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, instances would display as a plain defaultdict.
        return "<%s %r>" % (type(self).__name__, dict(self))

    def __missing__(self, b):
        # Cache miss: work out the quoted form once, remember it, and
        # hand it back.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
749
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 3986 Uniform Resource Identifier (URI): Generic Syntax lists
    the following (un)reserved characters.

    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="

    Each of the reserved characters is reserved in some component of a
    URL, but not necessarily in all of them.  Letters, digits and the
    characters "_.-~" are never quoted.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects.  encoding and errors
    must not be specified if string is a bytes object.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Non-str input is passed on as-is; it has no text encoding, so
        # the two text-only options are rejected outright.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if not string:
        # Empty str: nothing to encode or quote.
        return string
    codec = 'utf-8' if encoding is None else encoding
    policy = 'strict' if errors is None else errors
    return quote_from_bytes(string.encode(codec, policy), safe)
796
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # string may be str or bytes; look for a space using the matching
    # literal type.  Any other type is treated as possibly containing one.
    if isinstance(string, str):
        space_free = ' ' not in string
    elif isinstance(string, bytes):
        space_free = b' ' not in string
    else:
        space_free = False
    if space_free:
        # Without spaces the regular quote() already gives the answer.
        return quote(string, safe, encoding, errors)

    # Mark the space as safe so it passes through quote() unescaped and
    # can then be swapped for '+'; the literal has to match safe's type.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000813
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''

    # Normalize 'safe' to a bytes object holding ASCII values only.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)

    # Fast path: if stripping every safe byte leaves nothing, the input
    # needs no quoting at all.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()

    # One Quoter per distinct safe set, shared across calls.
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000835
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).

    Returns the 'key=value' parameters joined with '&'.  Raises TypeError
    if query is neither a mapping nor a suitable non-string sequence.
    """

    def _quote_component(component):
        # bytes are handed to quote_via unchanged and without the text
        # options; everything else is converted with str() first.
        if isinstance(component, bytes):
            return quote_via(component, safe)
        return quote_via(str(component), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            # Re-raise with a clearer message while keeping the traceback
            # of the failure that was detected above.
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    expand_sequences = bool(doseq)
    params = []
    for key, value in query:
        key = _quote_component(key)
        if not expand_sequences or isinstance(value, bytes):
            params.append(key + '=' + _quote_component(value))
        elif isinstance(value, str):
            # A str is a sequence too, but it is one value, not a list of
            # one-character values.
            params.append(key + '=' +
                          quote_via(value, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(value)
            except TypeError:
                # not a sequence
                params.append(key + '=' + _quote_component(value))
            else:
                # One 'key=element' parameter per element of the sequence.
                for element in value:
                    params.append(key + '=' + _quote_component(element))
    return '&'.join(params)
915
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str is returned unchanged if it is pure ASCII, otherwise
    UnicodeError is raised.  Any non-str argument is returned as-is.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        # The encode/decode round trip only succeeds for pure ASCII text.
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
928
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; an enclosing '<...>' pair and a leading 'URL:' prefix
    are then removed if present.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
936
# Compiled on the first call to splittype() and reused afterwards.
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The returned type is lower-cased.  When url has no leading
    'type:' part, (None, url) is returned.
    """
    global _typeprog
    if _typeprog is None:
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = _typeprog.match(url)
    if found is None:
        return None, url
    return found.group(1).lower(), found.group(2)
949
# Compiled on the first call to splithost() and reused afterwards.
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    When url does not start with '//', (None, url) is returned.
    """
    global _hostprog
    if _hostprog is None:
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    found = _hostprog.match(url)
    if found is None:
        return None, url
    host_port, path = found.groups()
    # A remainder that starts with '?' or '#' gets a leading slash so
    # that a non-empty path always begins with '/'.
    if path and not path.startswith('/'):
        path = '/' + path
    return host_port, path
964
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split is made at the last '@'; the user part is None when
    there is no '@' at all.
    """
    userinfo, at_sign, hostport = host.rpartition('@')
    if not at_sign:
        return None, hostport
    return userinfo, hostport
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000969
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split is made at the first ':'; passwd is None when there is
    no ':' at all.
    """
    name, colon, secret = user.partition(':')
    if not colon:
        return name, None
    return name, secret
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000974
# Pattern used by splitport(); compiled on first use.
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits, or as None when host
    has no numeric port.  A trailing ':' with nothing after it is
    dropped from the returned host.
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)$', re.DOTALL)

    found = _portprog.match(host)
    if found is None:
        return host, None
    hostname, port = found.groups()
    # An empty digit group (as in 'host:') counts as no port.
    return hostname, (port or None)
989
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found or if nothing follows
    the last ':'; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' is followed by something that is not a valid
    number."""
    hostname, colon, port = host.rpartition(':')
    if not colon:
        # No ':' at all: rpartition leaves the whole input in the last slot.
        return port, defport
    if not port:
        return hostname, defport
    try:
        return hostname, int(port)
    except ValueError:
        return hostname, None
1005
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split is made at the last '?'; without one, (url, None) is
    returned.
    """
    path, question_mark, query = url.rpartition('?')
    if not question_mark:
        return url, None
    return path, query
1012
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split is made at the last '#'; without one, (url, None) is
    returned.
    """
    path, hash_sign, tag = url.rpartition('#')
    if not hash_sign:
        return url, None
    return path, tag
1019
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
1025
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split is made at the first '='; value is None when there is
    no '=' at all.
    """
    name, equals, value = attr.partition('=')
    if not equals:
        return name, None
    return name, value