blob: 958767a08d7e77eeffc63217695c10877acd98b7 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaranfd41e082010-04-17 14:44:14 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000011RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000012Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
David Malcolmee255682010-12-02 16:41:00 +000014RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000015
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000019RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000020McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
the urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000028"""
29
Serhiy Storchaka8ea46162013-03-14 21:31:37 +020030import re
Facundo Batista2ac5de22008-07-07 18:24:11 +000031import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000032import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000033
Jeremy Hylton1afc1692008-06-18 20:49:58 +000034__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
Senthil Kumaran0256b2a2010-10-25 16:36:20 +000035 "urlsplit", "urlunsplit", "urlencode", "parse_qs",
36 "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
Serhiy Storchaka15154502015-04-07 19:09:01 +030037 "unquote", "unquote_plus", "unquote_to_bytes",
38 "DefragResult", "ParseResult", "SplitResult",
39 "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000040
# Scheme classification tables.  The empty string '' stands for "no scheme"
# and means the rule applies by default.

# Schemes whose relative references are resolved against a base URL.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '', 'sftp', 'svn',
    'svn+ssh', 'ws', 'wss',
]

# Schemes that carry a network location ("//netloc") part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu',
    'rsync', '', 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
    'ws', 'wss',
]

# Schemes whose path may carry ";params".
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms', '', 'sftp', 'tel',
]

# The three lists below are no longer consulted by this module; they are
# kept because they are undocumented but have public-looking names.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap', 'snews',
    'sip', 'sips',
]
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms', 'gopher', 'rtsp',
    'rtspu', 'sip', 'sips', '',
]
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', 'https',
    'shttp', 'snews', 'file', 'prospero', '',
]

# Every character that may appear in a scheme name.
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789' '+-.'
)

# XXX: Consider replacing with functools.lru_cache
# Upper bound on the number of cached urlsplit() results.
MAX_CACHE_SIZE = 20
_parse_cache = {}
74
def clear_cache():
    """Empty both module caches: parsed URLs and cached quoters."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000079
80
Nick Coghlan9fc443c2010-11-30 15:48:08 +000081# Helpers for bytes handling
82# For 3.2, we deliberately require applications that
83# handle improperly quoted URLs to do their own
84# decoding and encoding. If valid use cases are
85# presented, we may relax this by using latin-1
86# decoding internally for 3.3
# Codec and error policy used when bytes arguments are implicitly decoded
# (and when results are encoded back).
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'


def _noop(obj):
    """Return *obj* unchanged; the result coercion used for str inputs."""
    return obj


def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Encode a str-based result; the result coercion for bytes inputs."""
    return obj.encode(encoding, errors)


def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Decode every item of *args*; falsy items are replaced by ''."""
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)


def _coerce_args(*args):
    """Normalise arguments to str and pick the matching result coercion.

    Returns the (possibly decoded) arguments followed by one extra item:
    ``_noop`` when the first argument is a str, ``_encode_result``
    otherwise.

    Raises TypeError when a non-empty later argument disagrees with the
    first one about being a str.
    """
    expect_str = isinstance(args[0], str)
    for extra in args[1:]:
        # Falsy values are exempt so that the "scheme=''" default of some
        # callers can be combined with bytes input.
        if not extra:
            continue
        if isinstance(extra, str) != expect_str:
            raise TypeError("Cannot mix str and non-str arguments")
    if not expect_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
116
117# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Mixin giving str-based result tuples an encode() method."""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Encode every field and build the bytes counterpart from them."""
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return self._encoded_counterpart(*encoded_fields)
124
125
class _ResultMixinBytes(object):
    """Mixin giving bytes-based result tuples a decode() method."""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Decode every field and build the str counterpart from them."""
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return self._decoded_counterpart(*decoded_fields)
132
133
class _NetlocResultMixinBase(object):
    """Accessors derived from netloc, shared by str and bytes results.

    Subclasses supply ``_userinfo`` (username, password) and ``_hostinfo``
    (hostname, port) pairs.
    """
    __slots__ = ()

    @property
    def username(self):
        return self._userinfo[0]

    @property
    def password(self):
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when it is empty or missing."""
        host = self._hostinfo[0]
        return host.lower() if host else None

    @property
    def port(self):
        """Port as an int, or None when no port is present.

        Raises ValueError for a non-numeric port or one outside 0-65535.
        """
        raw_port = self._hostinfo[1]
        if raw_port is None:
            return None
        number = int(raw_port, 10)
        if number < 0 or number > 65535:
            raise ValueError("Port out of range 0-65535")
        return number
163
164
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """netloc splitting for str results."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from before the last '@', else Nones."""
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) taken from after the last '@'; port may be None."""
        host_and_port = self.netloc.rpartition('@')[2]
        _, open_bracket, after_bracket = host_and_port.partition('[')
        if open_bracket:
            # Bracketed literal: the host is the text between '[' and ']'.
            host, _, trailer = after_bracket.partition(']')
            port = trailer.partition(':')[2]
        else:
            host, _, port = host_and_port.partition(':')
        return host, (port or None)
193
194
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """netloc splitting for bytes results."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from before the last b'@', else Nones."""
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(b':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) taken from after the last b'@'; port may be None."""
        host_and_port = self.netloc.rpartition(b'@')[2]
        _, open_bracket, after_bracket = host_and_port.partition(b'[')
        if open_bracket:
            # Bracketed literal: the host is the text between '[' and ']'.
            host, _, trailer = after_bracket.partition(b']')
            port = trailer.partition(b':')[2]
        else:
            host, _, port = host_and_port.partition(b':')
        return host, (port or None)
223
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000224
225from collections import namedtuple
226
# Plain namedtuple bases for the public result classes.
_DefragResultBase = namedtuple('DefragResult', ['url', 'fragment'])
_SplitResultBase = namedtuple(
    'SplitResult', ['scheme', 'netloc', 'path', 'query', 'fragment'])
_ParseResultBase = namedtuple(
    'ParseResult',
    ['scheme', 'netloc', 'path', 'params', 'query', 'fragment'])

# --- DefragResult documentation -------------------------------------------
_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

# --- SplitResult documentation --------------------------------------------
_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

# --- ParseResult documentation --------------------------------------------
_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

# The remaining ParseResult fields reuse the SplitResult field docs.
for _shared in ('scheme', 'netloc', 'path', 'query', 'fragment'):
    getattr(_ParseResultBase, _shared).__doc__ = (
        getattr(_SplitResultBase, _shared).__doc__)
del _shared
292
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000293
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000294# For backwards compatibility, alias _NetlocResultMixinStr
295# ResultBase is no longer part of the documented API, but it is
296# retained since deprecating it isn't worth the hassle
297ResultBase = _NetlocResultMixinStr
298
299# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """(url, fragment) pair holding str data."""
    __slots__ = ()

    def geturl(self):
        """Return the URL, with '#fragment' re-attached when non-empty."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000307
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """5-field split result holding str data."""
    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL via urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt
312
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """6-field parse result holding str data."""
    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL via urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt
317
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000318# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """(url, fragment) pair holding bytes data."""
    __slots__ = ()

    def geturl(self):
        """Return the URL, with b'#fragment' re-attached when non-empty."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
326
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """5-field split result holding bytes data."""
    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL via urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt
331
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """6-field parse result holding bytes data."""
    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL via urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt
336
337# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Link each str result class with its bytes twin, in both directions."""
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000350
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Returns a ParseResult (scheme, netloc, path, params, query, fragment),
    or its bytes counterpart for bytes input.  Components are not broken
    down further (netloc stays one string) and % escapes are not expanded.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Params are only split off for schemes listed in uses_params.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000366
def _splitparams(url):
    """Split ';params' off the last path segment of *url*.

    Returns ``(path, params)``.  Only a ';' located at or after the last
    '/' counts; when there is none, the input is returned unchanged with
    empty params.
    """
    # rfind() gives -1 when there is no '/', so clamp to 0 to search the
    # whole string in that case.
    search_from = max(url.rfind('/'), 0)
    i = url.find(';', search_from)
    if i < 0:
        # No parameter separator in the last segment.  Previously a
        # slash-less input without ';' fell through with i == -1 and lost
        # its last character.
        return url, ''
    return url[:i], url[i+1:]
375
def _splitnetloc(url, start=0):
    """Split *url* at the first '/', '?' or '#' found at or after *start*.

    Returns ``(url[start:cut], url[cut:])``; when no delimiter is found
    the cut point is the end of the string.
    """
    candidates = (url.find(delimiter, start) for delimiter in '/?#')
    found = [position for position in candidates if position >= 0]
    cut = min(found) if found else len(url)
    return url[start:cut], url[cut:]
383
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Returns a SplitResult (scheme, netloc, path, query, fragment), or its
    bytes counterpart for bytes input.  Components are not broken down
    further (netloc stays one string) and % escapes are not expanded.

    Raises ValueError("Invalid IPv6 URL") when the netloc has an unmatched
    '[' or ']'.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':
            # Common case: accepted as a scheme without further checks.
            scheme, url = prefix.lower(), url[colon+1:]
        elif all(ch in scheme_chars for ch in prefix):
            # If everything after the colon is digits, it is taken to be a
            # port number and the prefix stays part of the path.
            rest = url[colon+1:]
            if not rest or any(ch not in '0123456789' for ch in rest):
                scheme, url = prefix.lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        has_open = '[' in netloc
        has_close = ']' in netloc
        if has_open != has_close:
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return _coerce_result(result)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000439
def urlunparse(components):
    """Rebuild a URL from the six components produced by urlparse().

    The result may differ slightly from the URL originally parsed while
    remaining equivalent, e.g. when the original had a '?' followed by an
    empty query.
    """
    (scheme, netloc, path, params, query, fragment,
     _coerce_result) = _coerce_args(*components)
    if params:
        path = "%s;%s" % (path, params)
    five_parts = (scheme, netloc, path, query, fragment)
    return _coerce_result(urlunsplit(five_parts))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000450
def urlunsplit(components):
    """Combine the five elements returned by urlsplit() into a URL.

    *components* may be any five-item iterable.  The result may differ
    slightly from the URL originally parsed while remaining equivalent,
    e.g. when the original had a '?' followed by an empty query.
    """
    scheme, netloc, url, query, fragment, _coerce_result = (
        _coerce_args(*components))
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        # A non-empty path following an authority must start with '/'.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url += '?' + query
    if fragment:
        url += '#' + fragment
    return _coerce_result(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000469
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty *base* returns *url* unchanged and an empty *url* returns
    *base* unchanged.
    """
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, _ = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    # A different scheme, or one without relative semantics: url stands alone.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _coerce_result(
                urlunparse((scheme, netloc, path, params, query, fragment)))
        netloc = bnetloc

    if not path and not params:
        # Nothing but query/fragment given: keep the base path and params.
        path, params = bpath, bparams
        if not query:
            query = bquery
        return _coerce_result(
            urlunparse((scheme, netloc, path, params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # The last item is not a directory, so it does not take part in
        # resolving the relative path.
        base_parts.pop()

    # For RFC 3986, ignore the whole base path when the reference path
    # starts at the root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # Drop empty inner elements that would produce redundant slashes
        # when the resolved path is re-joined.
        segments[1:-1] = [part for part in segments[1:-1] if part]

    resolved_path = []
    for seg in segments:
        if seg == '.':
            continue
        if seg != '..':
            resolved_path.append(seg)
        elif resolved_path:
            # A '..' with nothing left to remove is ignored.
            resolved_path.pop()

    if segments[-1] in ('.', '..'):
        # The last segment was a relative directory, so the result needs a
        # trailing '/'.
        resolved_path.append('')

    joined_path = '/'.join(resolved_path) or '/'
    return _coerce_result(
        urlunparse((scheme, netloc, joined_path, params, query, fragment)))
Antoine Pitrou55ac5b32014-08-21 19:16:17 -0400537
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000538
def urldefrag(url):
    """Remove any existing fragment from *url*.

    Returns a DefragResult (or its bytes counterpart for bytes input)
    holding the defragmented URL and the fragment; the fragment is the
    empty string when the URL contains no '#'.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    scheme, netloc, path, params, query, frag = urlparse(url)
    without_fragment = urlunparse((scheme, netloc, path, params, query, ''))
    return _coerce_result(DefragResult(without_fragment, frag))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000554
_hexdig = '0123456789ABCDEFabcdef'
# Maps two hex digits (as bytes, any case mix) to the single byte they
# encode.  Built on first use by unquote_to_bytes().
_hextobyte = None

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    global _hextobyte
    # Note: str input is encoded as UTF-8.  That only matters when it holds
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Bare attribute access: raises for objects that are not
        # string-like.
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escaped = string.split(b'%')
    if not escaped:
        return string
    # The table is created lazily so no memory is spent on it when the
    # function is never called.
    if _hextobyte is None:
        hex_pairs = (a + b for a in _hexdig for b in _hexdig)
        _hextobyte = {pair.encode(): bytes.fromhex(pair)
                      for pair in hex_pairs}
    pieces = [head]
    for chunk in escaped:
        byte = _hextobyte.get(chunk[:2])
        if byte is None:
            # Not a valid %xx escape: keep the text verbatim.
            pieces.append(b'%')
            pieces.append(chunk)
        else:
            pieces.append(byte)
            pieces.append(chunk[2:])
    return b''.join(pieces)
587
# Matches maximal runs of ASCII characters; used by unquote() to separate
# the parts that may hold %xx escapes from non-ASCII text.
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    *encoding* and *errors* specify how percent-encoded sequences are
    decoded into Unicode characters, as accepted by bytes.decode().  A
    value of None selects the defaults: UTF-8 decoding, with invalid
    sequences replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if '%' not in string:
        # Bare attribute access: raises for objects that are not
        # string-like.
        string.split
        return string
    encoding = 'utf-8' if encoding is None else encoding
    errors = 'replace' if errors is None else errors
    # With one capturing group, split() alternates non-ASCII text (even
    # indexes) with ASCII runs (odd indexes).
    bits = _asciire.split(string)
    pieces = [bits[0]]
    for ascii_run, following in zip(bits[1::2], bits[2::2]):
        pieces.append(unquote_to_bytes(ascii_run).decode(encoding, errors))
        pieces.append(following)
    return ''.join(pieces)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000614
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query string into a dict mapping names to lists of values.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value keeps them as blank strings; the default false
        value drops them as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.
    """
    parsed_result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors):
        # Values of a repeated name are collected in order of appearance.
        parsed_result.setdefault(name, []).append(value)
    return parsed_result
Facundo Batistac469d4c2008-09-03 22:49:01 +0000646
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value keeps them as blank strings; the default false value
        drops them as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    Returns a list.
    """
    qs, _coerce_result = _coerce_args(qs)
    result = []
    # Fields are separated by '&' or ';'.
    for amp_part in qs.split('&'):
        for field in amp_part.split(';'):
            if not field and not strict_parsing:
                continue
            raw_name, equals, raw_value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control name without an equal sign counts as a blank
                # value.
                if not keep_blank_values:
                    continue
            if not (raw_value or keep_blank_values):
                continue
            name = unquote(raw_name.replace('+', ' '),
                           encoding=encoding, errors=errors)
            value = unquote(raw_value.replace('+', ' '),
                            encoding=encoding, errors=errors)
            result.append((_coerce_result(name), _coerce_result(value)))
    return result
694
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Decode an HTML-form style value: '+' means space, then unquote().

    Every plus sign is turned into a space before the percent-escapes are
    decoded, so an escaped plus ('%2B') survives as a literal '+'.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
703
# Byte values that are never percent-encoded, whatever 'safe' set the caller
# supplies: ASCII letters, digits and the punctuation '_', '.', '-'.
# Iterating a bytes literal yields ints, so this is a frozenset of ints.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
# The same byte values packed into a bytes object; quote_from_bytes() passes
# it to bytes.rstrip() to detect input that needs no quoting at all.
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache filled by quote_from_bytes(): maps a normalized 'safe' bytes value to
# the bound __getitem__ of the Quoter built for that value.
_safe_quoters = {}
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000710
class Quoter(collections.defaultdict):
    """Lazily built table mapping a byte value (0..255) to its quoted text.

    A byte that belongs to the safe set maps to the corresponding one-character
    string; every other byte maps to its '%XX' percent-escape (upper-case hex).
    """
    # The defaultdict base is used purely as a cache: once a byte has been
    # looked up, later lookups are plain dict hits and never re-enter
    # __missing__ (so no Python-level code runs for them).

    def __init__(self, safe):
        """safe: bytes object listing extra bytes that must not be escaped."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # The inherited repr would present this object as a defaultdict.
        cached = dict(self)
        return "<%s %r>" % (self.__class__.__name__, cached)

    def __missing__(self, b):
        # First lookup of this byte: compute the quoted form, remember it
        # in the mapping, and hand it back.
        if b in self.safe:
            res = chr(b)
        else:
            res = '%{:02X}'.format(b)
        self[b] = res
        return res
732
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Percent-encode the characters of string that are not allowed to appear
    literally in a URL component.

    Which characters are "reserved" depends on the URL component (path,
    query, ...).  RFC 2396 Uniform Resource Identifiers (URI): Generic
    Syntax gives this list of reserved characters:

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of them is reserved in at least one component, but not in all.

    The default safe='/' targets the path component: slashes are left
    alone on the assumption that the ones already present are acting as
    path separators.

    string and safe may each be str or bytes.  When string is bytes,
    encoding and errors must not be given (TypeError is raised otherwise).

    For a str argument, encoding and errors say how non-ASCII characters
    are turned into bytes, with the meaning they have for str.encode();
    they default to 'utf-8' and 'strict' (so an unencodable character
    raises UnicodeEncodeError).  An empty str is returned unchanged.
    """
    if not isinstance(string, str):
        # Bytes-like input is already encoded; codec arguments make no sense.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if not string:
        return string
    codec = 'utf-8' if encoding is None else encoding
    handler = 'strict' if errors is None else errors
    return quote_from_bytes(string.encode(codec, handler), safe)
776
def quote_plus(string, safe='', encoding=None, errors=None):
    """Variant of quote() for HTML form values: ' ' is written as '+'.

    A plus sign already present in string is percent-escaped unless '+' is
    listed in safe.  Unlike quote(), safe defaults to '' (so '/' is escaped).
    """
    # string may be str or bytes; pick the matching space literal for the
    # membership test.  With no space present, plain quote() is already
    # the right answer.
    if isinstance(string, str):
        spaceless = ' ' not in string
    elif isinstance(string, bytes):
        spaceless = b' ' not in string
    else:
        spaceless = False
    if spaceless:
        return quote(string, safe, encoding, errors)
    # Let spaces pass through quote() untouched (by declaring them safe, in
    # the same type as 'safe'), then swap them for '+' afterwards.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000793
def quote_from_bytes(bs, safe='/'):
    """Percent-encode a bytes or bytearray object; no text encoding is done.

    Works like quote() but takes already-encoded data.  The result is
    always an ASCII-only str.  Raises TypeError for any other input type.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    # Normalize 'safe' to a bytes object holding ASCII values only.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)
    # Fast path: stripping every safe byte leaves nothing, so no byte needs
    # escaping and the input can be decoded as-is.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    # One Quoter per distinct 'safe' value is kept in the module-level cache.
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000815
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    query: a mapping (anything with an items() method) or a sequence of
    (key, value) two-element tuples.  For a sequence, the parameters appear
    in the output in the same order as in the input.

    doseq: if true, a value that is a sequence (but not a str or bytes
    object) is expanded so that each element becomes its own 'key=element'
    parameter.  If false, such a value is converted with str() and encoded
    as a single parameter.

    Keys, values and sequence elements may be str or bytes; anything else
    is converted with str() before quoting.

    safe, encoding and errors are passed down to the function given as
    quote_via (encoding and errors only when the component is not bytes).

    Returns the 'key=value' parameters joined with '&'.

    Raises TypeError if query is neither a mapping nor a sequence whose
    first item is a tuple (for example a non-empty string).
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # Non-sequences fail on len(); non-empty strings fail the tuple
            # test.  Zero-length sequences of any type (including '') are
            # accepted, mirroring the long-standing acceptance of empty
            # dicts.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    def encode(component):
        # Quote one key/value/element: bytes go through untouched, anything
        # else is stringified and encoded with the caller's codec settings.
        if isinstance(component, bytes):
            return quote_via(component, safe)
        return quote_via(str(component), safe, encoding, errors)

    params = []
    for k, v in query:
        k = encode(k)
        if not doseq or isinstance(v, bytes):
            params.append(k + '=' + encode(v))
        elif isinstance(v, str):
            # A str is a sequence, but must be treated as a single value.
            params.append(k + '=' + quote_via(v, safe, encoding, errors))
        else:
            try:
                # Supporting len() is the (rough) test for sequence-ness.
                len(v)
            except TypeError:
                # Not a sequence: encode it as one scalar value.
                params.append(k + '=' + encode(v))
            else:
                for elt in v:
                    params.append(k + '=' + encode(elt))
    return '&'.join(params)
895
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Despite the name, a str argument comes back as a str: it is only
    checked to be pure ASCII (UnicodeError is raised otherwise).  Any
    non-str argument is returned unchanged.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
908
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; an enclosing '<...>' pair and a leading 'URL:' marker are
    each removed if present.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
916
# Compiled on first use by splittype().
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type (scheme) is returned lower-cased.  If url has no leading
    'type:' part, the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = _typeprog.match(url)
    if found is None:
        return None, url
    scheme, data = found.groups()
    return scheme.lower(), data
929
# Compiled on first use by splithost().
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    If url does not start with '//', the result is (None, url).  A
    non-empty remainder that does not begin with '/' (e.g. '?query') gets
    a '/' prepended.
    """
    global _hostprog
    if _hostprog is None:
        _hostprog = re.compile('//([^/?]*)(.*)', re.DOTALL)

    found = _hostprog.match(url)
    if found is None:
        return None, url
    host_port, path = found.groups()
    if path and not path.startswith('/'):
        path = '/' + path
    return host_port, path
944
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Without any '@' the user part is
    None.
    """
    userinfo, at_sign, hostport = host.rpartition('@')
    if not at_sign:
        userinfo = None
    return userinfo, hostport
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000949
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without any ':' the password
    part is None.
    """
    name, colon, secret = user.partition(':')
    if not colon:
        secret = None
    return name, secret
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000954
955# splittag('/path#tag') --> '/path', 'tag'
# Compiled on first use by splitport().
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  When there is no
    ':digits' suffix the port is None; a bare trailing ':' is dropped
    from the returned host.
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)$', re.DOTALL)

    found = _portprog.match(host)
    if found is None:
        return host, None
    host, port = found.groups()
    # An empty digit run (as in 'host:') counts as "no port".
    return host, (port or None)
969
def splitnport(host, defport=-1):
    """Split host and port, returning the port as a number.

    Return (host, defport) if there is no ':' or nothing follows the last
    ':'; defport defaults to -1.
    Return (host, int(port)) if the text after the last ':' is a valid
    number.
    Return (host, None) if text follows the last ':' but is not a valid
    number."""
    name, colon, port_text = host.rpartition(':')
    if not colon:
        # rpartition() leaves the whole input in the last slot on a miss.
        return port_text, defport
    if not port_text:
        return name, defport
    try:
        return name, int(port_text)
    except ValueError:
        return name, None
985
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without any '?' the result is
    (url, None).
    """
    path, question_mark, query = url.rpartition('?')
    if not question_mark:
        return url, None
    return path, query
992
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without any '#' the result is
    (url, None).
    """
    path, hash_sign, tag = url.rpartition('#')
    if not hash_sign:
        return url, None
    return path, tag
999
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'."""
    path, *attrs = url.split(';')
    return path, attrs
1005
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without any '=' the value is
    None.
    """
    name, equals, value = attr.partition('=')
    if not equals:
        value = None
    return name, value