blob: 99a6977ebff69a805b5a065b2fa819388edfeb3d [file] [log] [blame]
"""Parse (absolute and relative) URLs.

The urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to the
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
"""
29
Serhiy Storchaka8ea46162013-03-14 21:31:37 +020030import re
Facundo Batista2ac5de22008-07-07 18:24:11 +000031import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000032import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000033
# Names exported by a star-import of this module.
__all__ = [
    "urlparse", "urlunparse", "urljoin", "urldefrag",
    "urlsplit", "urlunsplit", "urlencode", "parse_qs",
    "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
    "unquote", "unquote_plus", "unquote_to_bytes",
    "DefragResult", "ParseResult", "SplitResult",
    "DefragResultBytes", "ParseResultBytes", "SplitResultBytes",
]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000040
# A classification of schemes.  The empty string '' stands for "no scheme"
# and means the rule applies by default.

# Consulted by urljoin() before resolving a relative reference.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', '', 'sftp',
    'svn', 'svn+ssh',
]

# Consulted by urlunsplit() and urljoin() when handling the '//netloc' part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
    'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
]

# Consulted by urlparse() before splitting ';params' off the path.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    'mms', '', 'sftp', 'tel',
]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000053
# The three lists below are no longer used by this module.  They are kept
# only for backwards compatibility: they are undocumented, but their names
# look public.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips',
]
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '',
]
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero', '',
]
63
# Every character that may appear in a scheme name: ASCII letters, digits
# and the three punctuation marks '+', '-' and '.'.
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789'
    '+-.'
)
69
# XXX: Consider replacing with functools.lru_cache
# urlsplit() memoizes its results in _parse_cache and empties the caches
# once MAX_CACHE_SIZE entries have accumulated.
MAX_CACHE_SIZE = 20
_parse_cache = {}


def clear_cache():
    """Empty the URL parse cache and the cache of quoter objects."""
    _parse_cache.clear()
    _safe_quoters.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000078
79
# Helpers for bytes handling.
# For 3.2 the choice is deliberate: applications that must cope with
# improperly quoted URLs have to do their own decoding and encoding.
# Should valid use cases turn up, this may be relaxed for 3.3 by decoding
# with latin-1 internally.
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
88
def _noop(obj):
    """Return *obj* unchanged (the result coercion used for str input)."""
    return obj
91
def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Return ``obj.encode(encoding, errors)``.

    Used as the result coercion for non-str input; *obj* may be a str or
    any result object that offers an ``encode()`` method.
    """
    encoded = obj.encode(encoding, errors)
    return encoded
95
def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Decode each item of *args* and return the results as a tuple.

    Falsy items (for example b'' or None) are not decoded; they become ''.
    """
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)
99
def _coerce_args(*args):
    """Normalise the arguments to str and pick a matching result coercion.

    Returns the (possibly decoded) arguments followed by one extra item, a
    callable to apply to the result:
      - _noop when the first argument is a str (arguments returned as-is),
      - _encode_result otherwise (arguments decoded via _decode_args).

    Raises TypeError if a non-empty later argument disagrees with the first
    one about being a str.
    """
    expect_str = isinstance(args[0], str)
    for other in args[1:]:
        # Empty values are exempt so that the "scheme=''" default argument
        # of some functions works with bytes input too.
        if not other:
            continue
        if isinstance(other, str) != expect_str:
            raise TypeError("Cannot mix str and non-str arguments")
    if not expect_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
115
# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Mixin that turns a str-based parsed result into its bytes twin."""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Encode every field and build ``self._encoded_counterpart``."""
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return self._encoded_counterpart(*encoded_fields)
123
124
class _ResultMixinBytes(object):
    """Mixin that turns a bytes-based parsed result into its str twin."""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Decode every field and build ``self._decoded_counterpart``."""
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return self._decoded_counterpart(*decoded_fields)
131
132
class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element

    Concrete mixins provide two properties that this class builds on:
    ``_userinfo`` -> (username, password) and ``_hostinfo`` -> (hostname,
    port).  Both the str and the bytes flavour are supported.
    """
    __slots__ = ()

    @property
    def username(self):
        """First item of ``self._userinfo``."""
        return self._userinfo[0]

    @property
    def password(self):
        """Second item of ``self._userinfo``."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when the host part is empty.

        The zone identifier of a scoped IPv6 address (the text after '%',
        as in ``fe80::1%eth0``) is case-sensitive and is therefore returned
        exactly as written; only the address part is lower-cased.
        """
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        # Pick the separator matching the flavour (str or bytes) in use.
        separator = '%' if isinstance(hostname, str) else b'%'
        address, percent, zone = hostname.partition(separator)
        return address.lower() + percent + zone

    @property
    def port(self):
        """Port as an int, or None when no port is present.

        Raises ValueError if the port text is not a base-10 integer or if
        the number lies outside 0-65535.
        """
        port = self._hostinfo[1]
        if port is None:
            return None
        try:
            number = int(port, 10)
        except ValueError:
            # Replace int()'s generic message with one naming the port.
            raise ValueError("Port could not be cast to integer value "
                             "as %r" % (port,)) from None
        if not 0 <= number <= 65535:
            raise ValueError("Port out of range 0-65535")
        return number
162
163
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """str implementation of the netloc splitting helpers."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last '@'.

        Both are None without an '@'; password is None without a ':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last '@'.

        A bracketed host ('[...]') is returned without its brackets; an
        absent or empty port is reported as None.
        """
        host_port = self.netloc.rpartition('@')[2]
        _, open_bracket, after_bracket = host_port.partition('[')
        if open_bracket:
            host, _, trailer = after_bracket.partition(']')
            port = trailer.partition(':')[2]
        else:
            host, _, port = host_port.partition(':')
        return host, (port or None)
192
193
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """bytes implementation of the netloc splitting helpers."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last b'@'.

        Both are None without a b'@'; password is None without a b':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(b':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last b'@'.

        A bracketed host (b'[...]') is returned without its brackets; an
        absent or empty port is reported as None.
        """
        host_port = self.netloc.rpartition(b'@')[2]
        _, open_bracket, after_bracket = host_port.partition(b'[')
        if open_bracket:
            host, _, trailer = after_bracket.partition(b']')
            port = trailer.partition(b':')[2]
        else:
            host, _, port = host_port.partition(b':')
        return host, (port or None)
222
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000223
from collections import namedtuple

# Plain namedtuple bases; the public result classes add behaviour on top.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

# --- DefragResult documentation ---------------------------------------

_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

# --- SplitResult documentation ----------------------------------------

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

# --- ParseResult documentation ----------------------------------------

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

# Every field ParseResult shares with SplitResult reuses SplitResult's text.
_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__
291
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000292
# Backwards-compatibility alias for _NetlocResultMixinStr.  ResultBase has
# dropped out of the documented API, but deprecating it is not worth the
# hassle, so the name stays.
ResultBase = _NetlocResultMixinStr
297
# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """(url, fragment) pair holding str values."""
    __slots__ = ()

    def geturl(self):
        """Return the URL with '#fragment' re-attached when there is one."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000306
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """Five-field split result holding str values."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL by way of urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt
311
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """Six-field parse result holding str values."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL by way of urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt
316
# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """(url, fragment) pair holding bytes values."""
    __slots__ = ()

    def geturl(self):
        """Return the URL with b'#fragment' re-attached when there is one."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
325
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """Five-field split result holding bytes values."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL by way of urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt
330
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """Six-field parse result holding bytes values."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL by way of urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt
335
# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Cross-link every str result class with its bytes counterpart.

    After this runs, encode() on a str result can find the bytes class and
    decode() on a bytes result can find the str class.
    """
    for str_cls, bytes_cls in (
            (DefragResult, DefragResultBytes),
            (SplitResult, SplitResultBytes),
            (ParseResult, ParseResultBytes)):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls


# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000349
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    The components are not broken up any further (netloc, for instance,
    stays a single string) and % escapes are not expanded.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Params are only split off for schemes listed in uses_params.
    if ';' in path and scheme in uses_params:
        path, params = _splitparams(path)
    return _coerce_result(
        ParseResult(scheme, netloc, path, params, query, fragment))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000365
def _splitparams(url):
    """Split *url* at a ';' into (path, params).

    When the text contains a '/', only a ';' at or after the last '/' counts
    (and without one the whole text is returned with empty params);
    otherwise the first ';' is used.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semicolon = url.find(';', last_slash)
        if semicolon < 0:
            return url, ''
    else:
        semicolon = url.find(';')
    return url[:semicolon], url[semicolon + 1:]
374
def _splitnetloc(url, start=0):
    """Return (netloc, rest) where netloc is url[start:] up to a delimiter.

    The netloc stops at the earliest '/', '?' or '#' found at or after
    *start*, or at the end of *url* when none of them occurs.
    """
    hits = [pos for pos in (url.find(ch, start) for ch in '/?#') if pos >= 0]
    end = min(hits) if hits else len(url)
    return url[start:end], url[end:]
382
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    The components are not broken up any further (netloc, for instance,
    stays a single string) and % escapes are not expanded.

    Raises ValueError("Invalid IPv6 URL") when the netloc holds only one of
    '[' and ']'.  Results are memoized in the module-level parse cache.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        remainder = url[colon + 1:]
        if prefix == 'http':
            # Common case: accepted as the scheme without further checks.
            scheme, url = prefix, remainder
        elif all(ch in scheme_chars for ch in prefix):
            # Make sure the text after ':' is not actually a port number
            # (in which case the "scheme" is really part of the path).
            if not remainder or any(ch not in '0123456789'
                                    for ch in remainder):
                scheme, url = prefix.lower(), remainder

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Exactly one of the two brackets present means a broken literal.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return _coerce_result(result)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000438
def urlunparse(components):
    """Join a 6-item (scheme, netloc, path, params, query, fragment) into a URL.

    The result can differ slightly from the URL that was parsed, while
    staying equivalent, if the original carried redundant delimiters such
    as a '?' with an empty query (the draft states these are equivalent).
    """
    (scheme, netloc, url, params,
     query, fragment, _coerce_result) = _coerce_args(*components)
    path = "%s;%s" % (url, params) if params else url
    return _coerce_result(urlunsplit((scheme, netloc, path, query, fragment)))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000449
def urlunsplit(components):
    """Join the five items returned by urlsplit() back into a complete URL.

    *components* may be any five-item iterable.  The result can differ
    slightly from the URL that was parsed, while staying equivalent, if the
    original had unnecessary delimiters (for example a '?' with an empty
    query; the RFC states that these are equivalent).
    """
    (scheme, netloc, url,
     query, fragment, _coerce_result) = _coerce_args(*components)
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        # A non-empty path must be rooted before it follows '//netloc'.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    pieces = []
    if scheme:
        pieces += [scheme, ':']
    pieces.append(url)
    if query:
        pieces += ['?', query]
    if fragment:
        pieces += ['#', fragment]
    return _coerce_result(''.join(pieces))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000468
def urljoin(base, url, allow_fragments=True):
    """Resolve the possibly relative *url* against *base*.

    An empty *base* returns *url* and an empty *url* returns *base*, both
    untouched.
    """
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    (bscheme, bnetloc, bpath,
     bparams, bquery, bfragment) = urlparse(base, '', allow_fragments)
    (scheme, netloc, path,
     params, query, fragment) = urlparse(url, bscheme, allow_fragments)

    # A different scheme, or one without relative semantics: url stands alone.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    # No path and no params: inherit both from the base, and the query as
    # well unless the reference brings its own.
    if not (path or params):
        return _coerce_result(urlunparse((scheme, netloc, bpath, bparams,
                                          query or bquery, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1]:
        # The last item is not a directory, so it plays no part in
        # resolving the relative path.
        base_parts.pop()

    # For rfc3986, ignore the whole base path if the reference is rooted.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # Drop empty inner items that would produce redundant slashes when
        # the resolved path is joined again.
        segments[1:-1] = [item for item in segments[1:-1] if item]

    resolved_path = []
    for segment in segments:
        if segment == '..':
            # A '..' with nothing left to remove is ignored (rfc3986).
            if resolved_path:
                resolved_path.pop()
        elif segment != '.':
            resolved_path.append(segment)

    if segments[-1] in ('.', '..'):
        # The reference ended on a relative directory, so the result needs
        # a trailing '/'.
        resolved_path.append('')

    joined_path = '/'.join(resolved_path) or '/'
    return _coerce_result(urlunparse((scheme, netloc, joined_path,
                                      params, query, fragment)))
Antoine Pitrou55ac5b32014-08-21 19:16:17 -0400536
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000537
def urldefrag(url):
    """Strip the fragment, if any, from *url*.

    Returns a DefragResult (or its bytes counterpart for bytes input)
    holding the URL without fragment and the fragment itself; the fragment
    is the empty string when *url* has none.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    s, n, p, a, q, frag = urlparse(url)
    return _coerce_result(DefragResult(urlunparse((s, n, p, a, q, '')), frag))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000553
_hexdig = '0123456789ABCDEFabcdef'
# Two-hex-digit bytes -> the single byte they denote; built on first use.
_hextobyte = None


def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: str input is encoded as UTF-8.  That only matters when it holds
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Attribute probe: fails for empty values that are not string-like.
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    pieces = string.split(b'%')
    if len(pieces) == 1:
        return string
    # The table is created lazily so that no memory is spent on it when
    # this function is never called.
    global _hextobyte
    if _hextobyte is None:
        _hextobyte = {(hi + lo).encode(): bytes([int(hi + lo, 16)])
                      for hi in _hexdig for lo in _hexdig}
    out = [pieces[0]]
    for piece in pieces[1:]:
        decoded = _hextobyte.get(piece[:2])
        if decoded is None:
            # Not a valid escape: keep the '%' and the text literally.
            out += [b'%', piece]
        else:
            out += [decoded, piece[2:]]
    return b''.join(out)
586
_asciire = re.compile('([\x00-\x7f]+)')


def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by the characters they stand for.

    *encoding* and *errors* say how the percent-encoded byte sequences are
    decoded into text, exactly as for bytes.decode().  The defaults decode
    as UTF-8 and substitute a placeholder character for invalid sequences;
    passing None for either selects its default.

    unquote('abc%20def') -> 'abc def'.
    """
    if '%' not in string:
        # Attribute probe: fails for arguments that are not string-like.
        string.split
        return string
    encoding = 'utf-8' if encoding is None else encoding
    errors = 'replace' if errors is None else errors
    # Splitting on the capturing pattern alternates between text outside
    # the ASCII range (even slots) and runs of ASCII (odd slots); only the
    # ASCII runs can contain escapes.
    pieces = _asciire.split(string)
    out = [pieces[0]]
    for ascii_run, other in zip(pieces[1::2], pieces[2::2]):
        out.append(unquote_to_bytes(ascii_run).decode(encoding, errors))
        out.append(other)
    return ''.join(out)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000613
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query string into a dict mapping each name to a list of values.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value keeps them as blank strings; the default false
        value drops them as if they had not been included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.
    """
    collected = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors):
        # Values for a repeated name accumulate in order of appearance.
        collected.setdefault(name, []).append(value)
    return collected
Facundo Batistac469d4c2008-09-03 22:49:01 +0000645
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value keeps them as blank strings; the default false
        value drops them as if they had not been included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    Returns a list.
    """
    qs, _coerce_result = _coerce_args(qs)
    # Fields are separated by '&' as well as by ';'.
    fields = [piece for chunk in qs.split('&') for piece in chunk.split(';')]
    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        name, equals, value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control name without an equal sign counts as a blank value.
            if not keep_blank_values:
                continue
        if value or keep_blank_values:
            name = unquote(name.replace('+', ' '),
                           encoding=encoding, errors=errors)
            value = unquote(value.replace('+', ' '),
                            encoding=encoding, errors=errors)
            result.append((_coerce_result(name), _coerce_result(value)))
    return result
693
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Decode an HTML-form style quoted string.

    Works like unquote(), except that every plus sign is first turned
    into a space, which is how form values encode spaces.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
702
703_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
704 b'abcdefghijklmnopqrstuvwxyz'
705 b'0123456789'
706 b'_.-')
Florent Xiclunac7b8e862010-05-17 17:33:07 +0000707_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
708_safe_quoters = {}
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000709
class Quoter(collections.defaultdict):
    """Lazily filled table mapping a byte value (0..255) to its quoted text.

    A byte that belongs to the safe set (the always-safe bytes plus the
    ones supplied by the caller) maps to the matching one-character
    string; every other byte maps to its '%XX' percent-encoded form.
    """
    # defaultdict doubles as the cache: once a key has been stored,
    # looking it up again never reaches __missing__.

    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # The inherited repr would present this as a plain defaultdict.
        return "<%s %r>" % (self.__class__.__name__, dict(self))

    def __missing__(self, b):
        # First lookup of this byte: compute, remember, and return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
731
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Percent-encode a string for use in a URL component.

    Which characters must be quoted depends on the URL component (path,
    query, ...) being built.  RFC 2396, Uniform Resource Identifiers
    (URI): Generic Syntax, gives this list of reserved characters:

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Every one of them is reserved in at least one component, but not
    necessarily in all of them.

    The default safe='/' targets the path component: slashes that are
    already in a path act as separators there, so they are left alone.

    string and safe may each be a str or a bytes object.  When string
    is a bytes object, encoding and errors must not be given; a
    TypeError is raised otherwise.

    For a str, encoding and errors say how non-ASCII characters are
    turned into bytes, with the meaning they have for str.encode().
    They default to 'utf-8' and 'strict' (an unencodable character
    raises UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Bytes input is already encoded, so codec options make no sense.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if not string:
        # Nothing to quote; hand the empty string straight back.
        return string

    codec = 'utf-8' if encoding is None else encoding
    policy = 'strict' if errors is None else errors
    return quote_from_bytes(string.encode(codec, policy), safe)
775
def quote_plus(string, safe='', encoding=None, errors=None):
    """Quote like quote(), but write spaces as '+' (HTML form encoding).

    A literal plus sign in the input is percent-encoded unless it is
    listed in safe.  Unlike quote(), safe does not default to '/'.
    """
    # string may be str or bytes; look for a space using the matching type.
    if isinstance(string, str):
        spaceless = ' ' not in string
    elif isinstance(string, bytes):
        spaceless = b' ' not in string
    else:
        spaceless = False
    if spaceless:
        # Without spaces, plain quote() already gives the final answer.
        return quote(string, safe, encoding, errors)

    # Keep spaces unquoted for now (matching the type of safe), then
    # swap them for plus signs in the quoted result.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000792
def quote_from_bytes(bs, safe='/'):
    """Percent-encode a bytes or bytearray object; return a str.

    Same job as quote(), minus the str-to-bytes encoding step.

    quote_from_bytes(b'abc def?') -> 'abc%20def%3F'

    safe may be a str or a bytes-like object; non-ASCII entries in it
    are ignored.  Raises TypeError if bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    # Normalize 'safe' to bytes containing only ASCII values.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)
    # Shortcut: if stripping safe bytes leaves nothing, no byte needs quoting.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    # Reuse the per-'safe' quoting callable when one was built before.
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000814
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    query: a mapping (anything with an items() method) or a sequence of
    (key, value) pairs.  For a sequence, the parameters appear in the
    output in the same order as in the input.

    doseq: if true, a value that is a sequence (other than str or bytes)
    produces one 'key=element' parameter per element.  If false, every
    value is converted with str() and quoted as a single parameter.

    Keys and values may each be a str or a bytes object; any other
    object is converted with str() before quoting.

    safe, encoding and errors are handed to quote_via, the quoting
    function (encoding and errors only when the component is not bytes).

    Returns the 'key=value' parameters joined with '&'.  Raises TypeError
    if query is neither a mapping nor a sequence whose first element is
    a tuple (an empty sequence is accepted).
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    def quote_component(component):
        # bytes are quoted as they are; anything else is stringified first.
        if isinstance(component, bytes):
            return quote_via(component, safe)
        return quote_via(str(component), safe, encoding, errors)

    params = []
    for k, v in query:
        key = quote_component(k)
        if not doseq or isinstance(v, bytes):
            values = [quote_component(v)]
        elif isinstance(v, str):
            # A str is a sequence too, but it is one value, not many.
            values = [quote_via(v, safe, encoding, errors)]
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                values = [quote_component(v)]
            else:
                # one parameter per element of the sequence
                values = [quote_component(elt) for elt in v]
        params.extend(key + '=' + value for value in values)
    return '&'.join(params)
894
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Check that a str URL is pure ASCII and return it as a str.  Any
    argument that is not a str is returned untouched.  Raises
    UnicodeError if the str contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
907
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str(); surrounding whitespace, an
    enclosing '<...>' pair and a leading 'URL:' marker are removed.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
915
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The scheme is returned in lower case.  If the URL does not start
    with a 'scheme:' prefix, the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # Compiled on first use and then kept for later calls.
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = _typeprog.match(url)
    if found is None:
        return None, url
    return found.group(1).lower(), found.group(2)
928
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    A non-empty remainder that does not begin with '/' (for example a
    bare '?query') gets a '/' prepended.  If the URL does not start
    with '//', the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        # Compiled on first use and then kept for later calls.
        _hostprog = re.compile('//([^/?]*)(.*)', re.DOTALL)

    found = _hostprog.match(url)
    if found is None:
        return None, url
    host_port, rest = found.groups()
    if rest and not rest.startswith('/'):
        rest = '/' + rest
    return host_port, rest
943
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Without any '@' the user part
    is None.
    """
    userinfo, at_sign, hostport = host.rpartition('@')
    if not at_sign:
        return None, hostport
    return userinfo, hostport
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000948
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without any ':' the password
    is None.
    """
    name, colon, secret = user.partition(':')
    if not colon:
        return name, None
    return name, secret
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000953
# _portprog is compiled lazily by splitport() on first use.
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of ASCII digits.  If the input
    does not end in ':' followed by digits, the result is (host, None)
    with host unchanged; the one exception is a bare trailing ':',
    which is dropped ('host:' --> 'host', None).
    """
    global _portprog
    if _portprog is None:
        # No '$' anchor: '$' also matches just before a trailing newline,
        # which would let 'host:80\n' through and silently drop the newline.
        _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

    # fullmatch() requires the digits to run to the very end of the input.
    match = _portprog.fullmatch(host)
    if match:
        host, port = match.groups()
        if port:
            return host, port
    return host, None
968
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Return given default port if no ':' found; defaults to -1.  The
    default is also returned when nothing follows the last ':'.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number.

    Only ASCII digits count as a valid number: text such as ' 80',
    '+80' or '8_0' gives None even though int() would accept it.
    """
    host, delim, port = host.rpartition(':')
    if not delim:
        # No ':' at all: rpartition() left the whole input in the last slot.
        return port, defport
    if not port:
        return host, defport
    if any(ch not in '0123456789' for ch in port):
        return host, None
    try:
        return host, int(port)
    except ValueError:
        # Safety net, e.g. for interpreters that limit how long a digit
        # string int() will convert.
        return host, None
984
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without any '?' the result is
    (url, None).
    """
    head, question_mark, tail = url.rpartition('?')
    return (head, tail) if question_mark else (url, None)
991
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without any '#' the result is
    (url, None).
    """
    head, hash_mark, fragment = url.rpartition('#')
    if not hash_mark:
        return url, None
    return head, fragment
998
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when the URL contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
1004
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without any '=' the value is
    None.
    """
    name, equals, value = attr.partition('=')
    if not equals:
        return name, None
    return name, value