blob: 76086e58beaff06dd8cfc54cb0fef645f159d7f3 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaranfd41e082010-04-17 14:44:14 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732 : "Format for Literal IPv6 Addresses in URL's" by R. Hinden, B. Carpenter
and L. Masinter, December 1999.
10
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000011RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000012Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
David Malcolmee255682010-12-02 16:41:00 +000014RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000015
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000019RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000020McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000028"""
29
Serhiy Storchaka8ea46162013-03-14 21:31:37 +020030import re
Facundo Batista2ac5de22008-07-07 18:24:11 +000031import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000032import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000033
# Names exported by "from <this module> import *".
# NOTE(review): several of these (e.g. urlencode, quote, quote_plus,
# quote_from_bytes) are not defined in this part of the file; presumably
# they are defined further down -- confirm.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000040
# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by "urlsplit" and "urlparse".

# urljoin() only resolves a reference against its base when the (shared)
# scheme appears in this list; otherwise the reference is returned as is.
uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']

# urlunsplit() emits the '//' authority marker for these schemes even when
# the netloc is empty, and urljoin() inherits the base netloc for them.
uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']

# urlparse() splits ';params' off the path only for these schemes.
uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips']

uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero']

# Characters valid in scheme names
# (urlsplit() checks every character before the first ':' against this).
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
78
# XXX: Consider replacing with functools.lru_cache
# urlsplit() calls clear_cache() once the cache already holds this many
# entries, so the cache is emptied wholesale rather than evicted one by one.
MAX_CACHE_SIZE = 20
# Maps (url, scheme, allow_fragments, type(url), type(scheme)) to the
# SplitResult that urlsplit() computed for it.
_parse_cache = {}
82
def clear_cache():
    """Empty both module-level caches: parsed URLs and safe quoters."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000087
88
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
# These two values are the default codec and error handler of
# _encode_result() and _decode_args().
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
97
def _noop(obj):
    """Identity function: hand *obj* back unchanged.

    _coerce_args() returns this as the result coercion for str inputs.
    """
    return obj
100
def _encode_result(obj, encoding=_implicit_encoding,
                        errors=_implicit_errors):
    """Return ``obj.encode(encoding, errors)``.

    _coerce_args() returns this as the result coercion for non-str inputs.
    """
    encoder = obj.encode
    return encoder(encoding, errors)
104
def _decode_args(args, encoding=_implicit_encoding,
                       errors=_implicit_errors):
    """Decode every item of *args* and return the results as a tuple.

    Falsy items (empty bytes, '' or None) are replaced by the empty str
    instead of being decoded.
    """
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)
108
def _coerce_args(*args):
    """Normalise the arguments to str and pick the matching result coercion.

    Returns the (possibly decoded) arguments followed by one extra item, a
    callable to apply to the final result:
      - _noop when the first argument is a str,
      - _encode_result otherwise (the arguments are decoded first).

    Raises TypeError when a non-empty argument disagrees with the first
    argument about being a str.
    """
    wants_str = isinstance(args[0], str)
    for other in args[1:]:
        # Falsy values are exempt so that the "scheme=''" default argument
        # of some functions can be combined with non-str input.
        if not other:
            continue
        if isinstance(other, str) != wants_str:
            raise TypeError("Cannot mix str and non-str arguments")
    if not wants_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
124
125# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Mixin giving str-based result tuples an encode() method."""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Encode each field and build the ``_encoded_counterpart`` from them."""
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return self._encoded_counterpart(*encoded_fields)
132
133
class _ResultMixinBytes(object):
    """Mixin giving bytes-based result tuples a decode() method."""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Decode each field and build the ``_decoded_counterpart`` from them."""
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return self._decoded_counterpart(*decoded_fields)
140
141
class _NetlocResultMixinBase(object):
    """Accessors shared by the parsed result objects that carry a netloc.

    The properties read ``self._userinfo`` and ``self._hostinfo``, which
    this class does not define itself; the str and bytes mixins built on
    top of it provide them.
    """
    __slots__ = ()

    @property
    def username(self):
        """First item of ``_userinfo``."""
        return self._userinfo[0]

    @property
    def password(self):
        """Second item of ``_userinfo``."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when the host part is empty."""
        host = self._hostinfo[0]
        return host.lower() if host else None

    @property
    def port(self):
        """Port as an int, or None when ``_hostinfo`` reports no port.

        Raises ValueError when the port text is not a base-10 integer or
        the number lies outside 0-65535.
        """
        raw_port = self._hostinfo[1]
        if raw_port is None:
            return None
        number = int(raw_port, 10)
        if number < 0 or number > 65535:
            raise ValueError("Port out of range 0-65535")
        return number
171
172
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """Netloc decomposition for str results."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last '@'.

        Both are None without an '@'; password is None without a ':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last '@'.

        A bracketed host is returned without its brackets.  The port is
        the text after the separating ':' and is None when empty/absent.
        """
        host_port = self.netloc.rpartition('@')[2]
        _, open_bracket, inside = host_port.partition('[')
        if open_bracket:
            # Bracketed literal: the port follows the closing ']'.
            host, _, trailer = inside.partition(']')
            port = trailer.partition(':')[2]
        else:
            host, _, port = host_port.partition(':')
        return host, (port or None)
201
202
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """Netloc decomposition for bytes results."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last b'@'.

        Both are None without a b'@'; password is None without a b':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(b':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last b'@'.

        A bracketed host is returned without its brackets.  The port is
        the bytes after the separating b':' and is None when empty/absent.
        """
        host_port = self.netloc.rpartition(b'@')[2]
        _, open_bracket, inside = host_port.partition(b'[')
        if open_bracket:
            # Bracketed literal: the port follows the closing b']'.
            host, _, trailer = inside.partition(b']')
            port = trailer.partition(b':')[2]
        else:
            host, _, port = host_port.partition(b':')
        return host, (port or None)
231
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000232
from collections import namedtuple

# Plain namedtuple bases.  The public result classes further down combine
# each of these with a str or bytes mixin.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

# The assignments below attach documentation to the generated tuple types
# and to their individual field descriptors.
_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

# Fields shared with SplitResult reuse its documentation strings.
_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__


# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr
306
307# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    __slots__ = ()

    def geturl(self):
        """Return the URL, with '#' and the fragment appended when non-empty."""
        return self.url + '#' + self.fragment if self.fragment else self.url
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000315
# Str result type built by urlsplit(): the five tuple fields come from
# _SplitResultBase, the netloc accessors from _NetlocResultMixinStr.
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        """Return the URL obtained by passing this tuple to urlunsplit()."""
        return urlunsplit(self)
320
# Str result type built by urlparse(): the six tuple fields come from
# _ParseResultBase, the netloc accessors from _NetlocResultMixinStr.
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        """Return the URL obtained by passing this tuple to urlunparse()."""
        return urlunparse(self)
325
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000326# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    __slots__ = ()

    def geturl(self):
        """Return the URL, with b'#' and the fragment appended when non-empty."""
        return self.url + b'#' + self.fragment if self.fragment else self.url
334
# Bytes counterpart of SplitResult: same tuple fields, with the netloc
# accessors taken from _NetlocResultMixinBytes.
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        """Return the URL obtained by passing this tuple to urlunsplit()."""
        return urlunsplit(self)
339
# Bytes counterpart of ParseResult: same tuple fields, with the netloc
# accessors taken from _NetlocResultMixinBytes.
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        """Return the URL obtained by passing this tuple to urlunparse()."""
        return urlunparse(self)
344
# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Link every str result class with its bytes twin, in both directions."""
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# Run once at import time; the helper is then removed from the namespace.
_fix_result_transcoding()
del _fix_result_transcoding
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000358
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    The components are not broken up any further (e.g. netloc stays a
    single string) and % escapes are not expanded."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Parameters are only split off for schemes known to use them.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000374
def _splitparams(url):
    """Split *url* into (path, params) at a ';'.

    When the text contains a '/', only a ';' at or after the last '/' is
    considered, and (url, '') is returned if there is none.  Without any
    '/', the split happens at the first ';'.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semicolon = url.find(';', last_slash)
        if semicolon < 0:
            return url, ''
    else:
        # No "not found" check here: the position is used as returned.
        semicolon = url.find(';')
    return url[:semicolon], url[semicolon + 1:]
383
def _splitnetloc(url, start=0):
    """Return (url[start:end], url[end:]).

    *end* is the position of the earliest '/', '?' or '#' found at or
    after *start*, or len(url) when none of them occurs.
    """
    end = len(url)
    for delimiter in '/?#':
        position = url.find(delimiter, start)
        if 0 <= position < end:
            end = position
    return url[start:end], url[end:]
391
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    The components are not broken up any further (e.g. netloc stays a
    single string) and % escapes are not expanded.

    Raises ValueError("Invalid IPv6 URL") when the netloc contains only
    one of '[' and ']'."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    cache_key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(cache_key, None)
    if hit:
        return _coerce_result(hit)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':
            # Common case: accepted without the character and
            # port-number checks applied to other prefixes.
            scheme, url = 'http', url[colon + 1:]
        elif all(ch in scheme_chars for ch in prefix):
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            remainder = url[colon + 1:]
            if not remainder or any(ch not in '0123456789'
                                    for ch in remainder):
                # not a port number
                scheme, url = prefix.lower(), remainder

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Exactly one kind of square bracket means a broken IPv6 literal.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, _, fragment = url.partition('#')
    if '?' in url:
        url, _, query = url.partition('?')
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return _coerce_result(result)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000446
def urlunparse(components):
    """Put a parsed URL back together again.

    *components* is unpacked into the six fields (scheme, netloc, path,
    params, query, fragment).  The result may be slightly different from,
    but equivalent to, the URL that was parsed if that URL had redundant
    delimiters, e.g. a ? with an empty query (the draft states that these
    are equivalent)."""
    scheme, netloc, path, params, query, fragment, _coerce_result = (
        _coerce_args(*components))
    # Fold the params back into the path, then defer to urlunsplit().
    path = "%s;%s" % (path, params) if params else path
    rebuilt = urlunsplit((scheme, netloc, path, query, fragment))
    return _coerce_result(rebuilt)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000457
def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL.

    *components* is unpacked into the five fields (scheme, netloc, path,
    query, fragment).  The result may be slightly different from, but
    equivalent to, the URL that was parsed if that URL had unnecessary
    delimiters (for example, a ? with an empty query; the RFC states that
    these are equivalent)."""
    scheme, netloc, url, query, fragment, _coerce_result = (
        _coerce_args(*components))
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        # An authority part is emitted, so a non-empty path must be rooted.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url += '?' + query
    if fragment:
        url += '#' + fragment
    return _coerce_result(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000476
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    A falsy *base* returns *url* unchanged and a falsy *url* returns
    *base* unchanged."""
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    (bscheme, bnetloc, bpath,
     bparams, bquery, bfragment) = urlparse(base, '', allow_fragments)
    (scheme, netloc, path,
     params, query, fragment) = urlparse(url, bscheme, allow_fragments)

    # A different scheme, or one that is not resolved relatively, means the
    # reference is returned as given.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not path and not params:
        # Nothing path-like in the reference: reuse the base path/params,
        # and the base query too unless the reference has its own.
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, bpath,
                                          bparams, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        base_parts.pop()

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = [piece for piece in segments[1:-1] if piece]

    resolved_path = []
    for segment in segments:
        if segment == '.':
            continue
        if segment == '..':
            # ignore any .. segments that have nothing left to remove
            # when resolving for rfc3986
            if resolved_path:
                resolved_path.pop()
        else:
            resolved_path.append(segment)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    joined_path = '/'.join(resolved_path) or '/'
    return _coerce_result(urlunparse((scheme, netloc, joined_path,
                                      params, query, fragment)))
Antoine Pitrou55ac5b32014-08-21 19:16:17 -0400544
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000545
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    defragmented, fragment = url, ''
    if '#' in url:
        # Re-assemble everything except the fragment.
        scheme, netloc, path, params, query, fragment = urlparse(url)
        defragmented = urlunparse((scheme, netloc, path, params, query, ''))
    return _coerce_result(DefragResult(defragmented, fragment))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000561
_hexdig = '0123456789ABCDEFabcdef'
# Lookup table from two hex-digit bytes to the byte they denote; it is
# built by the first unquote_to_bytes() call that needs it.
_hextobyte = None

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    global _hextobyte
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escaped = string.split(b'%')
    if not escaped:
        # No '%' at all: nothing to decode.
        return string
    if _hextobyte is None:
        table = {}
        for high in _hexdig:
            for low in _hexdig:
                pair = high + low
                table[pair.encode()] = bytes.fromhex(pair)
        _hextobyte = table
    pieces = [head]
    for chunk in escaped:
        byte = _hextobyte.get(chunk[:2])
        if byte is None:
            # Not a valid escape: keep the '%' and the text as they were.
            pieces.append(b'%')
            pieces.append(chunk)
        else:
            pieces.append(byte)
            pieces.append(chunk[2:])
    return b''.join(pieces)
594
# Splits text into alternating runs of non-ASCII and (captured) ASCII text.
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.  Passing None for
    either parameter selects its default.

    string may be a str or a bytes object; the result is always a str.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    if isinstance(string, bytes):
        # bytes used to fail with TypeError on the "'%' not in" test below;
        # unescape at the byte level and decode the whole result instead.
        return unquote_to_bytes(string).decode(encoding, errors)
    if '%' not in string:
        # Is it a string-like object?
        string.split
        return string
    bits = _asciire.split(string)
    res = [bits[0]]
    append = res.append
    # Odd indexes hold the ASCII runs (the only place escapes can occur),
    # even indexes the non-ASCII text between them, which is kept as is.
    for i in range(1, len(bits), 2):
        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
        append(bits[i + 1])
    return ''.join(res)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000621
Senthil Kumaran257b9802017-04-04 21:19:43 -0700622
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        Returns a dictionary mapping each name to the list of its values,
        in order of appearance.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors):
        grouped.setdefault(name, []).append(value)
    return grouped
Facundo Batistac469d4c2008-09-03 22:49:01 +0000656
Senthil Kumaran257b9802017-04-04 21:19:43 -0700657
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as blank
            strings.  The default false value indicates that blank values
            are to be ignored and treated as if they were not included.

        strict_parsing: flag indicating what to do with parsing errors. If
            false (the default), errors are silently ignored. If true,
            errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        Returns a list of (name, value) pairs.
    """
    qs, _coerce_result = _coerce_args(qs)

    def _decode(text):
        # '+' stands for a space in form data; it is replaced before the
        # %xx escapes are expanded.
        text = unquote(text.replace('+', ' '),
                       encoding=encoding, errors=errors)
        return _coerce_result(text)

    result = []
    # Fields are separated by '&' and, within those, by ';'.
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not field and not strict_parsing:
                continue
            name, equals, value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # Handle case of a control-name with no equal sign
                if not keep_blank_values:
                    continue
            if value or keep_blank_values:
                result.append((_decode(name), _decode(value)))
    return result
705
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Variant of unquote() for HTML form values: every plus sign is turned
    into a space before the percent-escapes are decoded.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # The '+' substitution happens first, so an escaped plus ('%2B') is
    # still decoded to a literal '+' by unquote().
    return unquote(string.replace('+', ' '), encoding, errors)
714
# Byte values that are never percent-encoded, whatever 'safe' set the
# caller passes: ASCII letters, digits and the four marks "_", ".", "-", "~".
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    b'abcdefghijklmnopqrstuvwxyz'
    b'0123456789'
    b'_.-~'
)
# The same set as a bytes object, usable as an argument to bytes.rstrip().
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache of quoting callables, keyed by the normalized 'safe' bytes value.
_safe_quoters = {}
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000721
class Quoter(collections.defaultdict):
    """Lazily-filled table mapping a byte value (an int in range(0, 256))
    to the string that represents it in quoted output.

    A byte that belongs to the safe set maps to its own character; every
    other byte maps to its '%XX' percent-escape (uppercase hex).
    """
    # The defaultdict machinery is used purely as a cache: __missing__ runs
    # once per distinct byte, and later lookups of that byte are plain dict
    # hits that execute no Python code.
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Show the class name rather than the inherited defaultdict repr.
        return "<%s %r>" % (self.__class__.__name__, dict(self))

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, and return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
743
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 3986 Uniform Resource Identifier (URI): Generic Syntax lists
    the following (un)reserved characters.

    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="

    Each of the reserved characters is reserved in some component of a URL,
    but not necessarily in all of them.

    The unreserved characters are never quoted.  Every other character is
    percent-encoded unless it appears in the safe argument.

    Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings.
    Now, "~" is included in the set of unreserved characters.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects.  encoding and errors
    must not be specified if string is a bytes object; doing so raises
    TypeError.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if isinstance(string, str):
        # Nothing to quote; return the (empty) str unchanged.
        if not string:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        string = string.encode(encoding, errors)
    else:
        # Non-str input is quoted byte-for-byte, so a text encoding would
        # be meaningless; refuse it rather than silently ignoring it.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    return quote_from_bytes(string, safe)
790
def quote_plus(string, safe='', encoding=None, errors=None):
    """Quote like quote(), additionally turning each ' ' into '+', which is
    the convention for HTML form values.  A plus sign already present in
    the input is percent-escaped unless it is listed in safe.  Unlike
    quote(), safe defaults to the empty string rather than '/'.
    """
    # Work out whether there is any space to translate; string may be a
    # str or a bytes object.  Any other type falls through to quote().
    if isinstance(string, str):
        has_space = ' ' in string
    elif isinstance(string, bytes):
        has_space = b' ' in string
    else:
        has_space = True
    if not has_space:
        # Without spaces, plain quote() already gives the right answer.
        return quote(string, safe, encoding, errors)

    # Mark the space as safe so quote() leaves it alone, then swap it for
    # '+'.  The space literal must have the same type as safe.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000807
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'

    bs must be a bytes or bytearray object, otherwise TypeError is raised.
    safe may be a str or a bytes object; non-ASCII characters/bytes in it
    are ignored.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes([c for c in safe if c < 128])
    # Fast path: if stripping safe bytes leaves nothing, every byte is safe
    # and the input can be returned as text without any quoting.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    # One Quoter per distinct 'safe' value is cached; its bound __getitem__
    # maps a byte value to its quoted string.
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
    return ''.join([quoter(char) for char in bs])
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000829
def _quote_component(component, quote_via, safe, encoding, errors):
    """Quote one key, value or sequence element for urlencode().

    A bytes object is passed to quote_via as is (without encoding/errors);
    anything else is converted with str() first.
    """
    if isinstance(component, bytes):
        return quote_via(component, safe)
    return quote_via(str(component), safe, encoding, errors)


def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).

    Raises TypeError if query is neither a mapping nor a non-string sequence
    of tuples.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            # Re-raise with a friendlier message while keeping the traceback
            # that points at the failing check.
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    l = []
    if not doseq:
        for k, v in query:
            k = _quote_component(k, quote_via, safe, encoding, errors)
            v = _quote_component(v, quote_via, safe, encoding, errors)
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = _quote_component(k, quote_via, safe, encoding, errors)
            if isinstance(v, bytes):
                l.append(k + '=' + quote_via(v, safe))
            elif isinstance(v, str):
                # A str is a sequence too, but must stay a single value.
                l.append(k + '=' + quote_via(v, safe, encoding, errors))
            else:
                try:
                    # Is this a sufficient test for sequence-ness?
                    len(v)
                except TypeError:
                    # not a sequence
                    v = quote_via(str(v), safe, encoding, errors)
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence: one "key=element" per element
                    for elt in v:
                        elt = _quote_component(elt, quote_via, safe,
                                               encoding, errors)
                        l.append(k + '=' + elt)
    return '&'.join(l)
909
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Despite the name, a str argument comes back as a str: it is only
    round-tripped through ASCII to check that it has no non-ASCII
    characters (UnicodeError is raised if it has).  Any non-str argument
    is returned unchanged.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
922
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; an enclosing '<...>' pair and a leading 'URL:' prefix are
    then removed, stripping whitespace again after each removal.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
930
# Compiled lazily on the first call to splittype().
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type (scheme) is returned in lower case.  When url does not start
    with 'type:', the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = _typeprog.match(url)
    if not found:
        return None, url
    return found.group(1).lower(), found.group(2)
943
# Compiled lazily on the first call to splithost().
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '#' or '?'.  A non-empty remainder
    that does not begin with '/' gets one prepended.  When url does not
    start with '//', the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    found = _hostprog.match(url)
    if not found:
        return None, url
    netloc, rest = found.groups()
    if rest and not rest.startswith('/'):
        rest = '/' + rest
    return netloc, rest
958
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split is made at the last '@'.  Without any '@' the user part is
    None.
    """
    credentials, at_sign, hostport = host.rpartition('@')
    if not at_sign:
        return None, hostport
    return credentials, hostport
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000963
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split is made at the first ':'.  Without any ':' the password part
    is None.
    """
    name, colon, secret = user.partition(':')
    if not colon:
        return name, None
    return name, secret
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000968
969# splittag('/path#tag') --> '/path', 'tag'
# Compiled lazily on the first call to splitport().
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  It is None when host has
    no trailing ':digits' part; a trailing ':' with nothing after it is
    dropped from the returned host.
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)$', re.DOTALL)

    found = _portprog.match(host)
    if found is None:
        return host, None
    hostname, digits = found.groups()
    # An empty port ("host:") is reported as None, not as ''.
    return hostname, (digits or None)
983
def splitnport(host, defport=-1):
    """Split host and port, returning the port as a number.

    The split is made at the last ':'.
    Return defport (-1 by default) as the port when there is no ':' or
    nothing follows it.
    Return the port as an int when int() accepts the text after ':'.
    Return None as the port when the text after ':' is not a valid number.
    """
    name, colon, tail = host.rpartition(':')
    if not colon:
        # No ':' at all: rpartition() leaves the whole input in the tail.
        return tail, defport
    if not tail:
        return name, defport
    try:
        return name, int(tail)
    except ValueError:
        return name, None
999
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split is made at the last '?'.  Without any '?' the result is
    (url, None).
    """
    head, question_mark, query = url.rpartition('?')
    return (head, query) if question_mark else (url, None)
1006
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split is made at the last '#'.  Without any '#' the result is
    (url, None).
    """
    head, hash_mark, fragment = url.rpartition('#')
    return (head, fragment) if hash_mark else (url, None)
1013
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
1019
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split is made at the first '='.  Without any '=' the value part is
    None.
    """
    name, equals, rest = attr.partition('=')
    if not equals:
        return name, None
    return name, rest