blob: 566fbf7188818622de15aaf7302d1d3a7e053762 [file] [log] [blame]
"""Parse (absolute and relative) URLs.

urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
"""
29
Serhiy Storchaka8ea46162013-03-14 21:31:37 +020030import re
Facundo Batista2ac5de22008-07-07 18:24:11 +000031import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000032import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000033
# Public names exported by "from <module> import *".
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000040
# A classification of schemes ('' means apply by default)

# Schemes for which urljoin() resolves a relative reference against the base;
# for any other scheme urljoin() returns the second argument unchanged.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']
# Schemes that take a "//netloc" part; consulted by urlunsplit() and urljoin().
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']
# Schemes for which urlparse() splits ";params" off the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']
Jeremy Hylton1afc1692008-06-18 20:49:58 +000054
# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']
64
# Characters valid in scheme names; urlsplit() only treats the text before
# the first ':' as a scheme when every character is in this set.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# XXX: Consider replacing with functools.lru_cache
# urlsplit() calls clear_cache() once _parse_cache holds this many entries.
MAX_CACHE_SIZE = 20
# Maps (url, scheme, allow_fragments, type(url), type(scheme)) -> SplitResult.
_parse_cache = {}
74
def clear_cache():
    """Clear the parse cache and the quoters cache."""
    # Both are plain module-level dicts; empty them in place so existing
    # references to the dict objects stay valid.
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000079
80
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
# These two values are the default codec arguments of _encode_result()
# and _decode_args().
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
89
90def _noop(obj):
91 return obj
92
def _encode_result(obj, encoding=_implicit_encoding,
                        errors=_implicit_errors):
    """Encode *obj* via its own encode() method.

    Used as the result coercer when the caller passed bytes-like input;
    *obj* is either a str or a result object providing encode().
    """
    encoded = obj.encode(encoding, errors)
    return encoded
96
def _decode_args(args, encoding=_implicit_encoding,
                       errors=_implicit_errors):
    """Return a tuple with every item of *args* decoded to str.

    Falsy items (empty bytes, '', None, ...) are replaced by '' without
    calling decode() on them.
    """
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)
100
def _coerce_args(*args):
    """Normalise *args* to str and pick the matching result coercer.

    Returns the (possibly decoded) arguments followed by one extra item:
    _noop when the first argument is a str, _encode_result otherwise.

    Raises TypeError if a non-empty later argument disagrees with the
    first argument about being a str.
    """
    expect_str = isinstance(args[0], str)
    for other in args[1:]:
        # Empty values are exempt so that defaults such as scheme=''
        # can be combined with bytes input.
        if not other:
            continue
        if isinstance(other, str) != expect_str:
            raise TypeError("Cannot mix str and non-str arguments")
    if not expect_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
116
117# Result objects are more helpful than simple tuples
118class _ResultMixinStr(object):
119 """Standard approach to encoding parsed results from str to bytes"""
120 __slots__ = ()
121
122 def encode(self, encoding='ascii', errors='strict'):
123 return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))
124
125
126class _ResultMixinBytes(object):
127 """Standard approach to decoding parsed results from bytes to str"""
128 __slots__ = ()
129
130 def decode(self, encoding='ascii', errors='strict'):
131 return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))
132
133
134class _NetlocResultMixinBase(object):
135 """Shared methods for the parsed result objects containing a netloc element"""
136 __slots__ = ()
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000137
138 @property
139 def username(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000140 return self._userinfo[0]
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000141
142 @property
143 def password(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000144 return self._userinfo[1]
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000145
146 @property
147 def hostname(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000148 hostname = self._hostinfo[0]
149 if not hostname:
150 hostname = None
151 elif hostname is not None:
152 hostname = hostname.lower()
153 return hostname
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000154
155 @property
156 def port(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000157 port = self._hostinfo[1]
158 if port is not None:
159 port = int(port, 10)
Senthil Kumaran2fc5a502012-05-24 21:56:17 +0800160 # Return None on an illegal port
161 if not ( 0 <= port <= 65535):
162 return None
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000163 return port
164
165
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """str implementation of the netloc splitting helpers."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last '@'.

        Both are None without an '@'; password is None without a ':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        username, colon, password = credentials.partition(':')
        if not colon:
            password = None
        return username, password

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last '@'.

        A bracketed host ("[...]") is returned without its brackets.
        port is the raw text after ':' or None when missing/empty.
        """
        hostport = self.netloc.rpartition('@')[2]
        _, open_bracket, after_bracket = hostport.partition('[')
        if open_bracket:
            hostname, _, trailer = after_bracket.partition(']')
            port = trailer.partition(':')[2]
        else:
            hostname, _, port = hostport.partition(':')
        return hostname, (port or None)
194
195
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """bytes implementation of the netloc splitting helpers."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last b'@'.

        Both are None without a b'@'; password is None without a b':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        username, colon, password = credentials.partition(b':')
        if not colon:
            password = None
        return username, password

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last b'@'.

        A bracketed host (b"[...]") is returned without its brackets.
        port is the raw bytes after b':' or None when missing/empty.
        """
        hostport = self.netloc.rpartition(b'@')[2]
        _, open_bracket, after_bracket = hostport.partition(b'[')
        if open_bracket:
            hostname, _, trailer = after_bracket.partition(b']')
            port = trailer.partition(b':')[2]
        else:
            hostname, _, port = hostport.partition(b':')
        return hostname, (port or None)
224
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000225
from collections import namedtuple

# Plain tuple layouts; each is subclassed once for str and once for bytes
# results further down in this module.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple('SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple('ParseResult', 'scheme netloc path params query fragment')

# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr
236
237# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """(url, fragment) pair of str values."""
    __slots__ = ()

    def geturl(self):
        """Return url, with '#' + fragment appended when fragment is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000245
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """(scheme, netloc, path, query, fragment) tuple of str values."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the URL by passing this tuple to urlunsplit()."""
        return urlunsplit(self)
250
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """(scheme, netloc, path, params, query, fragment) tuple of str values."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the URL by passing this tuple to urlunparse()."""
        return urlunparse(self)
255
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000256# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """(url, fragment) pair of bytes values."""
    __slots__ = ()

    def geturl(self):
        """Return url, with b'#' + fragment appended when fragment is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
264
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """(scheme, netloc, path, query, fragment) tuple of bytes values."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the URL by passing this tuple to urlunsplit()."""
        return urlunsplit(self)
269
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """(scheme, netloc, path, params, query, fragment) tuple of bytes values."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the URL by passing this tuple to urlunparse()."""
        return urlunparse(self)
274
# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Point each str result class at its bytes twin and vice versa.

    encode()/decode() on the result mixins rely on the
    _encoded_counterpart / _decoded_counterpart attributes set here.
    """
    for str_cls, bytes_cls in (
            (DefragResult, DefragResultBytes),
            (SplitResult, SplitResultBytes),
            (ParseResult, ParseResultBytes)):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000288
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    The work is delegated to urlsplit(); params are then split off the
    path only when the scheme is listed in uses_params.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000304
305def _splitparams(url):
306 if '/' in url:
307 i = url.find(';', url.rfind('/'))
308 if i < 0:
309 return url, ''
310 else:
311 i = url.find(';')
312 return url[:i], url[i+1:]
313
314def _splitnetloc(url, start=0):
315 delim = len(url) # position of end of domain part of url, default is end
316 for c in '/?#': # look for delimiters; the order is NOT important
317 wdelim = url.find(c, start) # find first of this delim
318 if wdelim >= 0: # if found
319 delim = min(delim, wdelim) # use earliest delim position
320 return url[start:delim], url[delim:] # return (domain, rest)
321
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Results are memoised in _parse_cache.  Raises ValueError
    ("Invalid IPv6 URL") when the netloc has an unmatched '[' or ']'.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix, rest = url[:colon], url[colon + 1:]
        if prefix == 'http':
            # Common case: accepted as a scheme without the port-number
            # check applied to other prefixes below.
            scheme, url = prefix.lower(), rest
        elif all(ch in scheme_chars for ch in prefix):
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            if not rest or any(ch not in '0123456789' for ch in rest):
                # not a port number
                scheme, url = prefix.lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        has_open = '[' in netloc
        has_close = ']' in netloc
        if has_open != has_close:
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return _coerce_result(result)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000377
def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    *components* is a six-item iterable:
    (scheme, netloc, path, params, query, fragment).
    """
    scheme, netloc, path, params, query, fragment, _coerce_result = (
        _coerce_args(*components))
    # Fold params back into the path, then let urlunsplit() do the rest.
    path_and_params = "%s;%s" % (path, params) if params else path
    joined = urlunsplit((scheme, netloc, path_and_params, query, fragment))
    return _coerce_result(joined)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000388
def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment, _coerce_result = (
        _coerce_args(*components))
    # A "//" authority prefix is emitted when there is a netloc, or when the
    # scheme is one that takes a netloc and the path does not already
    # start with "//".
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        if url and not url.startswith('/'):
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000407
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is falsy the other one is returned as is.  *url*
    is also returned unchanged when its scheme differs from the base
    scheme or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default for a reference that has none.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            # The reference carries its own authority: nothing is taken
            # from the base beyond the scheme.
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not path and not params:
        # Empty path: reuse the base path and params, and the base query
        # too unless the reference supplies its own.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        del base_parts[-1]

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = filter(None, segments[1:-1])

    resolved_path = []

    for seg in segments:
        if seg == '..':
            try:
                resolved_path.pop()
            except IndexError:
                # ignore any .. segments that would otherwise cause an IndexError
                # when popped from resolved_path if resolving for rfc3986
                pass
        elif seg == '.':
            continue
        else:
            resolved_path.append(seg)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    # An empty joined path is emitted as '/'.
    return _coerce_result(urlunparse((scheme, netloc, '/'.join(
        resolved_path) or '/', params, query, fragment)))
Antoine Pitrou55ac5b32014-08-21 19:16:17 -0400475
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000476
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        # Nothing to strip: hand the input back untouched.
        return _coerce_result(DefragResult(url, ''))
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return _coerce_result(DefragResult(stripped, fragment))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000492
_hexdig = '0123456789ABCDEFabcdef'
# Two-hex-digit bytes -> single decoded byte; built on first use.
_hextobyte = None

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    global _hextobyte
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escaped = string.split(b'%')
    if not escaped:
        return string
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    if _hextobyte is None:
        _hextobyte = {(a + b).encode(): bytes([int(a + b, 16)])
                      for a in _hexdig for b in _hexdig}
    out = [head]
    push = out.append
    for chunk in escaped:
        byte = _hextobyte.get(chunk[:2])
        if byte is None:
            # Not a valid %XX escape: keep the text literally.
            push(b'%')
            push(chunk)
        else:
            push(byte)
            push(chunk[2:])
    return b''.join(out)
525
# Matches maximal runs of ASCII characters; used to isolate the parts of a
# str that may contain %xx escapes.
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if '%' not in string:
        # Attribute probe: fails early for objects that are not string-like.
        string.split
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    # With one capturing group, split() alternates: non-ASCII text at even
    # indexes, ASCII runs (the only place escapes can live) at odd indexes.
    pieces = _asciire.split(string)
    out = [pieces[0]]
    for ascii_run, other in zip(pieces[1::2], pieces[2::2]):
        out.append(unquote_to_bytes(ascii_run).decode(encoding, errors))
        out.append(other)
    return ''.join(out)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000552
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        Returns a dict mapping each name to the list of its values, in
        the order produced by parse_qsl().
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors):
        grouped.setdefault(name, []).append(value)
    return grouped
Facundo Batistac469d4c2008-09-03 22:49:01 +0000584
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were  not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    Returns a list of (name, value) pairs.
    """
    qs, _coerce_result = _coerce_args(qs)

    def _decode(text):
        # '+' stands for a space in form data; translate before unquoting.
        text = unquote(text.replace('+', ' '),
                       encoding=encoding, errors=errors)
        return _coerce_result(text)

    # Fields are separated by either '&' or ';'.
    fields = [piece for chunk in qs.split('&') for piece in chunk.split(';')]
    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        name, equals, value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # Handle case of a control-name with no equal sign
            if not keep_blank_values:
                continue
        if value or keep_blank_values:
            result.append((_decode(name), _decode(value)))
    return result
632
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # Translate '+' first so that an escaped plus (%2B) survives as '+'.
    spaced = string.replace('+', ' ')
    return unquote(spaced, encoding, errors)
641
642_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
643 b'abcdefghijklmnopqrstuvwxyz'
644 b'0123456789'
645 b'_.-')
Florent Xiclunac7b8e862010-05-17 17:33:07 +0000646_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
647_safe_quoters = {}
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000648
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000649class Quoter(collections.defaultdict):
650 """A mapping from bytes (in range(0,256)) to strings.
651
652 String values are percent-encoded byte values, unless the key < 128, and
653 in the "safe" set (either the specified safe set, or default set).
654 """
655 # Keeps a cache internally, using defaultdict, for efficiency (lookups
656 # of cached keys don't call Python code at all).
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000657 def __init__(self, safe):
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000658 """safe: bytes object."""
Florent Xiclunac7b8e862010-05-17 17:33:07 +0000659 self.safe = _ALWAYS_SAFE.union(safe)
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000660
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000661 def __repr__(self):
662 # Without this, will just display as a defaultdict
Serhiy Storchaka465e60e2014-07-25 23:36:00 +0300663 return "<%s %r>" % (self.__class__.__name__, dict(self))
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000664
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000665 def __missing__(self, b):
666 # Handle a cache miss. Store quoted string in cache and return.
Florent Xiclunac7b8e862010-05-17 17:33:07 +0000667 res = chr(b) if b in self.safe else '%{:02X}'.format(b)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000668 self[b] = res
669 return res
670
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects. encoding and errors
    must not be specified if string is a bytes object.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).

    Raises TypeError when encoding or errors is given for non-str input.
    """
    if not isinstance(string, str):
        # Codec arguments are meaningless for data that is already bytes.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if not string:
        return string
    raw = string.encode('utf-8' if encoding is None else encoding,
                        'strict' if errors is None else errors)
    return quote_from_bytes(raw, safe)
714
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values.  Plus signs already present in string are escaped
    unless '+' is included in safe.  Unlike quote(), safe does not default
    to '/'.
    """
    # Find out whether string (str or bytes) holds a space.  Without one,
    # plain quote() already gives the right answer.  Any other type is
    # sent down the general path.
    if isinstance(string, str):
        has_space = ' ' in string
    elif isinstance(string, bytes):
        has_space = b' ' in string
    else:
        has_space = True
    if not has_space:
        return quote(string, safe, encoding, errors)

    # Keep spaces unquoted for now (the extra safe character has to match
    # the type of safe), then turn them into '+'.
    blank = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + blank, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000731
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def?') -> 'abc%20def%3F'

    Raises TypeError if bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''

    # Normalize 'safe' to a bytes object holding ASCII values only.
    if isinstance(safe, str):
        safe_bytes = safe.encode('ascii', 'ignore')
    else:
        safe_bytes = bytes([c for c in safe if c < 128])

    # Shortcut: if stripping safe bytes leaves nothing, no byte needs
    # quoting and the input can be decoded directly.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe_bytes):
        return bs.decode()

    # One quoting callable is kept per distinct normalized 'safe' value.
    quoter = _safe_quoters.get(safe_bytes)
    if quoter is None:
        quoter = _safe_quoters[safe_bytes] = Quoter(safe_bytes).__getitem__
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000753
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    query is either an object with an items() method or a sequence of
    two-element tuples.  With a sequence, the parameters appear in the
    output in the same order as in the input.

    If doseq is true and a value is a sequence (other than str or bytes),
    each of its elements becomes a separate 'key=element' parameter.

    Keys and values may be str or bytes; any other object is converted
    with str() first.

    safe, encoding and errors are passed down to the function given as
    quote_via (encoding and errors only when the component is not bytes).

    Raises TypeError if query is neither a mapping nor a valid non-string
    sequence.
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # Strings and string-like objects are sequences as well, so they
        # have to be rejected explicitly.
        try:
            # len() fails for non-sequences, and the first item of a
            # non-empty string is not a tuple.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Empty sequences of any type pass this check.  Empty dicts
            # have always been accepted, so that is kept for consistency.
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    def encode_part(part):
        # bytes are quoted as they are; everything else goes through str()
        # and is quoted with the requested encoding/errors.
        if isinstance(part, bytes):
            return quote_via(part, safe)
        return quote_via(str(part), safe, encoding, errors)

    pairs = []
    if not doseq:
        for key, value in query:
            key = encode_part(key)
            value = encode_part(value)
            pairs.append(key + '=' + value)
        return '&'.join(pairs)

    for key, value in query:
        key = encode_part(key)

        if isinstance(value, bytes):
            value = quote_via(value, safe)
            pairs.append(key + '=' + value)
            continue
        if isinstance(value, str):
            value = quote_via(value, safe, encoding, errors)
            pairs.append(key + '=' + value)
            continue

        try:
            # Supporting len() is what counts as being a sequence here;
            # the length itself is not needed.
            len(value)
        except TypeError:
            # Not a sequence: encode its str() form as a single value.
            value = quote_via(str(value), safe, encoding, errors)
            pairs.append(key + '=' + value)
            continue

        # A sequence: emit one parameter per element.
        for item in value:
            item = encode_part(item)
            pairs.append(key + '=' + item)
    return '&'.join(pairs)
833
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Despite the name, a str argument comes back as a str; it is only
    checked to be pure ASCII.  Non-str arguments are returned unchanged.
    Raises UnicodeError if a str contains non-ASCII characters.
    """
    # Most URL schemes require ASCII.  If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
846
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; an enclosing '<...>' pair and a leading 'URL:' prefix are
    then removed, stripping whitespace again after each removal.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
854
# Pattern for splittype(); compiled on the first call.
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  If url does not begin with a
    'type:' prefix (one or more characters other than '/' and ':' followed
    by ':'), the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = _typeprog.match(url)
    if found is None:
        return None, url
    return found.group(1).lower(), found.group(2)
867
# Pattern for splithost(); compiled on the first call.
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    If url does not start with '//', the result is (None, url).  A
    non-empty remainder that does not start with '/' (for example a bare
    '?query') gets a '/' prepended.
    """
    global _hostprog
    if _hostprog is None:
        _hostprog = re.compile('//([^/?]*)(.*)', re.DOTALL)

    found = _hostprog.match(url)
    if found is None:
        return None, url
    authority, rest = found.groups()
    if rest and not rest.startswith('/'):
        rest = '/' + rest
    return authority, rest
882
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Without any '@' the user part is
    None.
    """
    userinfo, at_sign, hostport = host.rpartition('@')
    if not at_sign:
        return None, hostport
    return userinfo, hostport
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000887
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without any ':' the password
    part is None.
    """
    name, colon, secret = user.partition(':')
    if not colon:
        return name, None
    return name, secret
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000892
# Pattern for splitport(); compiled on the first call.
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of ASCII digits.  If host ends in a
    bare ':', the colon is dropped and the port is None.  If host does not
    end in ':' followed only by digits, host is returned unchanged and the
    port is None.
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

    # fullmatch() is used instead of a '$' anchor because '$' also matches
    # just before a trailing newline, which would accept 'host:80\n' as
    # port '80' and silently drop the newline.
    match = _portprog.fullmatch(host)
    if match is None:
        return host, None
    name, port = match.groups()
    return name, (port or None)
907
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    The split happens at the last ':'.
    Return the given default port (-1 unless specified) if no ':' is found
    or if nothing follows the ':'.
    Return the numerical port if a valid number is found after ':'.
    Return None as the port if ':' is followed by something that is not a
    valid number.  Only ASCII digits count as a valid number.
    """
    name, delim, port = host.rpartition(':')
    if not delim:
        # No ':' at all: rpartition() puts the whole input in the last slot.
        return port, defport
    if not port:
        return name, defport
    # int() alone is too lenient for a port: it also accepts signs,
    # surrounding whitespace, underscores and non-ASCII digits.
    if not all(ch in '0123456789' for ch in port):
        return name, None
    try:
        return name, int(port)
    except ValueError:
        # int() may still refuse the string (e.g. an excessive length).
        return name, None
923
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without any '?' the result is
    (url, None).
    """
    before, question_mark, after = url.rpartition('?')
    if not question_mark:
        return url, None
    return before, after
930
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without any '#' the result is
    (url, None).
    """
    before, hash_sign, after = url.rpartition('#')
    if not hash_sign:
        return url, None
    return before, after
937
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    Without any ';' the attribute list is empty.
    """
    path, *attrs = url.split(';')
    return path, attrs
943
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without any '=' the value part
    is None.
    """
    name, equals, value = attr.partition('=')
    if not equals:
        return name, None
    return name, value