blob: 5e2155ccafc595e57243a3c73d8741c9f2006a7c [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaranfd41e082010-04-17 14:44:14 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000011RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000012Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
David Malcolmee255682010-12-02 16:41:00 +000014RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000015
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000019RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000020McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000028"""
29
Serhiy Storchaka8ea46162013-03-14 21:31:37 +020030import re
Facundo Batista2ac5de22008-07-07 18:24:11 +000031import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000032import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000033
# Public API of this module, grouped by purpose.
__all__ = [
    # splitting, joining and reassembling URLs
    "urlparse", "urlunparse", "urljoin", "urldefrag",
    "urlsplit", "urlunsplit",
    # query strings and percent-quoting
    "urlencode", "parse_qs", "parse_qsl",
    "quote", "quote_plus", "quote_from_bytes",
    "unquote", "unquote_plus", "unquote_to_bytes",
    # structured result types (str flavour, then bytes flavour)
    "DefragResult", "ParseResult", "SplitResult",
    "DefragResultBytes", "ParseResultBytes", "SplitResultBytes",
]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000040
# Scheme classification tables.  The empty string '' stands for "no scheme",
# so listing it makes the behaviour apply by default.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '', 'sftp', 'svn',
    'svn+ssh',
]
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu',
    'rsync', '', 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
]
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms', '', 'sftp', 'tel',
]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000053
# Legacy tables: nothing in this module reads them any more.  They are
# undocumented but carry public-looking names, so they are kept for
# backwards compatibility.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet',
    'wais', 'imap', 'snews', 'sip', 'sips',
]
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '',
]
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
    'https', 'shttp', 'snews', 'file', 'prospero', '',
]
63
# Every character that may appear in a URL scheme name:
# ASCII letters (both cases), digits, and '+', '-', '.'.
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789' '+-.'
)
69
# XXX: Consider replacing with functools.lru_cache
# Hand-rolled cache of parse results and the entry count at which it is
# flushed.
_parse_cache = {}
MAX_CACHE_SIZE = 20
73
def clear_cache():
    """Clear the parse cache and the quoters cache."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000078
79
# Helpers for bytes handling
# Bytes arguments are decoded, and results re-encoded, strictly as ASCII.
# For 3.2 this is deliberate: applications that handle improperly quoted
# URLs must do their own decoding and encoding.  If valid use cases are
# presented, this may be relaxed to latin-1 decoding internally for 3.3.
_implicit_encoding, _implicit_errors = 'ascii', 'strict'
88
def _noop(obj):
    """Identity function: hand back *obj* untouched."""
    return obj
91
def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Encode *obj* with the given codec settings (implicit ones by default)."""
    encoded = obj.encode(encoding, errors)
    return encoded
95
def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Return a tuple with every item of *args* decoded.

    Falsy items (empty bytes, the '' default, None) become ''.
    """
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)
99
def _coerce_args(*args):
    """Normalise the arguments to str and pick a matching result converter.

    Returns the (possibly decoded) arguments followed by one extra item:
    a no-op for str input, or an encoding function for non-str input.
    Raises TypeError when str and non-str arguments are mixed.
    """
    expect_str = isinstance(args[0], str)
    # Empty values are exempt from the consistency check so that the
    # "scheme=''" default argument of some functions works with bytes input.
    if any(arg and isinstance(arg, str) != expect_str for arg in args[1:]):
        raise TypeError("Cannot mix str and non-str arguments")
    if not expect_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
115
116# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Return the bytes counterpart of this result, field by field."""
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return self._encoded_counterpart(*encoded_fields)
123
124
class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Return the str counterpart of this result, field by field."""
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return self._decoded_counterpart(*decoded_fields)
131
132
class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element

    Subclasses supply two properties: ``_userinfo``, a (username, password)
    pair, and ``_hostinfo``, a (hostname, port) pair, where an absent
    component is None.  The pairs may hold str or bytes values.
    """
    __slots__ = ()

    @property
    def username(self):
        """The user name from the netloc, or None if there is none."""
        return self._userinfo[0]

    @property
    def password(self):
        """The password from the netloc, or None if there is none."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """The lower-cased host name, or None if it is missing or empty."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        return hostname.lower()

    @property
    def port(self):
        """The port as an int, or None if the netloc has no port.

        Raises ValueError if the port is not made up solely of ASCII
        digits or lies outside the range 0-65535.
        """
        port = self._hostinfo[1]
        if port is None:
            return None
        # int() on its own is too lenient: it accepts a sign, surrounding
        # whitespace, digit-group underscores and non-ASCII digits.  Only
        # plain ASCII digits are accepted here.
        digits = '0123456789' if isinstance(port, str) else b'0123456789'
        if not port or port.strip(digits):
            raise ValueError(
                "Port could not be cast to integer value as %r" % (port,))
        port = int(port, 10)
        if not (0 <= port <= 65535):
            raise ValueError("Port out of range 0-65535")
        return port
162
163
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """Netloc accessors for results whose fields are str."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) from the netloc; None for absent parts."""
        userinfo, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            # No '@' at all: the netloc carries no user information.
            return None, None
        username, colon, password = userinfo.partition(':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) from the netloc; port is None when absent."""
        hostinfo = self.netloc.rpartition('@')[2]
        _, open_bracket, bracketed = hostinfo.partition('[')
        if open_bracket:
            # Bracketed literal: the host is the text between '[' and ']',
            # and any port follows the closing bracket.
            hostname, _, after_bracket = bracketed.partition(']')
            port = after_bracket.partition(':')[2]
        else:
            hostname, _, port = hostinfo.partition(':')
        return hostname, (port or None)
192
193
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """Netloc accessors for results whose fields are bytes."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) from the netloc; None for absent parts."""
        userinfo, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            # No b'@' at all: the netloc carries no user information.
            return None, None
        username, colon, password = userinfo.partition(b':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) from the netloc; port is None when absent."""
        hostinfo = self.netloc.rpartition(b'@')[2]
        _, open_bracket, bracketed = hostinfo.partition(b'[')
        if open_bracket:
            # Bracketed literal: the host is the text between b'[' and b']',
            # and any port follows the closing bracket.
            hostname, _, after_bracket = bracketed.partition(b']')
            port = after_bracket.partition(b':')[2]
        else:
            hostname, _, port = hostinfo.partition(b':')
        return hostname, (port or None)
222
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000223
from collections import namedtuple

# Plain tuple layouts shared by the str and the bytes result classes.
_DefragResultBase = namedtuple('DefragResult', ['url', 'fragment'])
_SplitResultBase = namedtuple(
    'SplitResult', ['scheme', 'netloc', 'path', 'query', 'fragment'])
_ParseResultBase = namedtuple(
    'ParseResult',
    ['scheme', 'netloc', 'path', 'params', 'query', 'fragment'])
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000229
# Backwards-compatibility alias for _NetlocResultMixinStr.  ResultBase is
# no longer part of the documented API; it is kept because deprecating it
# isn't worth the hassle.
ResultBase = _NetlocResultMixinStr
234
235# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """Result pair (url, fragment) holding str values."""
    __slots__ = ()

    def geturl(self):
        """Return the URL with its fragment, if any, appended after '#'."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000243
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """Five-field split result holding str values."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL via urlunsplit()."""
        return urlunsplit(self)
248
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """Six-field parse result holding str values."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL via urlunparse()."""
        return urlunparse(self)
253
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000254# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """Result pair (url, fragment) holding bytes values."""
    __slots__ = ()

    def geturl(self):
        """Return the URL with its fragment, if any, appended after b'#'."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
262
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """Five-field split result holding bytes values."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL via urlunsplit()."""
        return urlunsplit(self)
267
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """Six-field parse result holding bytes values."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL via urlunparse()."""
        return urlunparse(self)
272
# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Link each str result class with its bytes twin, in both directions."""
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000286
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    The components are not broken up any further (netloc, for example,
    stays a single string) and % escapes are not expanded."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Parameters are only split off for schemes known to use them.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return _coerce_result(
        ParseResult(scheme, netloc, path, params, query, fragment))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000302
def _splitparams(url):
    """Split *url* into (path, params) at a ';'.

    When the path contains a '/', only a ';' at or after the last '/'
    counts, and (url, '') is returned if there is none.  Without a '/',
    the first ';' is used.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        cut = url.find(';', last_slash)
        if cut < 0:
            return url, ''
    else:
        cut = url.find(';')
    return url[:cut], url[cut + 1:]
311
def _splitnetloc(url, start=0):
    """Return (netloc, rest), with the netloc starting at index *start*.

    The netloc ends at the earliest '/', '?' or '#' found at or after
    *start*, or at the end of *url* if none of them occurs.
    """
    found = [pos for pos in (url.find(ch, start) for ch in '/?#') if pos >= 0]
    end = min(found + [len(url)])
    return url[start:end], url[end:]
319
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    The components are not broken up any further (netloc, for example,
    stays a single string) and % escapes are not expanded."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        candidate = url[:colon]
        remainder = url[colon + 1:]
        if candidate == 'http':
            # Common case: accepted as a scheme without further checks.
            scheme, url = candidate.lower(), remainder
        elif all(ch in scheme_chars for ch in candidate):
            # Make sure the text after the colon is not actually a port
            # number, in which case the "scheme" is really part of the path.
            if not remainder or any(ch not in '0123456789'
                                    for ch in remainder):
                scheme, url = candidate.lower(), remainder

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # A bracket without its partner cannot be a valid IPv6 literal.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return _coerce_result(result)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000375
def urlunparse(components):
    """Put a parsed URL back together again.  The result may differ
    slightly from the URL that was parsed, while being equivalent, if the
    original had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    (scheme, netloc, url, params, query, fragment,
     _coerce_result) = _coerce_args(*components)
    # Fold the params back into the path, then defer to urlunsplit().
    path = "%s;%s" % (url, params) if params else url
    return _coerce_result(urlunsplit((scheme, netloc, path, query, fragment)))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000386
def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string.  The components argument can be any five-item
    iterable.  The result may differ slightly from the URL that was parsed,
    while being equivalent, if the original had unnecessary delimiters (for
    example, a ? with an empty query; the RFC states that these are
    equivalent)."""
    (scheme, netloc, url, query, fragment,
     _coerce_result) = _coerce_args(*components)
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        # A path that follows an authority has to start with '/'.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000405
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    # A different scheme, or one without relative forms: url stands alone.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not (path or params):
        # Nothing path-like in url: reuse the base path and params, and
        # the base query too unless url brings its own.
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, bpath,
                                          bparams, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1]:
        # the last item is not a directory, so it takes no part in
        # resolving the relative path
        base_parts.pop()

    # for rfc3986, ignore all base path should the first character be root.
    if path.startswith('/'):
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # drop empty inner elements, which would cause redundant slashes
        # when the resolved path is re-joined
        segments[1:-1] = [seg for seg in segments[1:-1] if seg]

    resolved_path = []
    for seg in segments:
        if seg == '.':
            continue
        if seg != '..':
            resolved_path.append(seg)
        elif resolved_path:
            # a '..' with nothing left to remove is ignored (rfc3986)
            resolved_path.pop()

    if segments[-1] in ('.', '..'):
        # the last segment was a relative dir, so the result needs a
        # trailing '/'
        resolved_path.append('')

    joined = '/'.join(resolved_path) or '/'
    return _coerce_result(urlunparse((scheme, netloc, joined,
                                      params, query, fragment)))
Antoine Pitrou55ac5b32014-08-21 19:16:17 -0400473
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000474
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        defrag, frag = url, ''
    else:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
    return _coerce_result(DefragResult(defrag, frag))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000490
_hexdig = '0123456789ABCDEFabcdef'
# Maps two hex digits (as bytes) to the byte they encode; built on first use.
_hextobyte = None

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    global _hextobyte
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escaped = string.split(b'%')
    if not escaped:
        return string
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    if _hextobyte is None:
        _hextobyte = {(a + b).encode(): bytes([int(a + b, 16)])
                      for a in _hexdig for b in _hexdig}
    pieces = [head]
    for chunk in escaped:
        byte = _hextobyte.get(chunk[:2])
        if byte is None:
            # Not a valid %xx escape: keep the text as it was.
            pieces.append(b'%')
            pieces.append(chunk)
        else:
            pieces.append(byte)
            pieces.append(chunk[2:])
    return b''.join(pieces)
523
# Matches runs of ASCII characters.
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if '%' not in string:
        string.split
        return string
    encoding = 'utf-8' if encoding is None else encoding
    errors = 'replace' if errors is None else errors
    # Splitting on a capturing pattern alternates non-ASCII text (even
    # indexes) with ASCII runs (odd indexes); only the latter are unquoted.
    bits = _asciire.split(string)
    pieces = [bits[0]]
    for ascii_run, other_text in zip(bits[1::2], bits[2::2]):
        pieces.append(unquote_to_bytes(ascii_run).decode(encoding, errors))
        pieces.append(other_text)
    return ''.join(pieces)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000550
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        Returns a dict mapping each field name to the list of its values,
        in order of appearance.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors):
        grouped.setdefault(name, []).append(value)
    return grouped
Facundo Batistac469d4c2008-09-03 22:49:01 +0000582
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as blank
            strings.  The default false value indicates that blank values
            are to be ignored and treated as if they were not included.

        strict_parsing: flag indicating what to do with parsing errors. If
            false (the default), errors are silently ignored. If true,
            errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, a ValueError is raised when qs holds
            more than max_num_fields fields.  The default of None means
            no limit.

        Returns a list of (name, value) pairs.
    """
    qs, _coerce_result = _coerce_args(qs)

    # Query strings usually come from untrusted input, so the optional
    # field limit is enforced up front, before any per-field work.  Both
    # '&' and ';' separate fields.
    if max_num_fields is not None:
        num_fields = 1 + qs.count('&') + qs.count(';')
        if max_num_fields < num_fields:
            raise ValueError('Max number of fields exceeded')

    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            # '+' stands for a space in form-encoded data and must be
            # replaced before the % escapes are expanded.
            name = nv[0].replace('+', ' ')
            name = unquote(name, encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = nv[1].replace('+', ' ')
            value = unquote(value, encoding=encoding, errors=errors)
            value = _coerce_result(value)
            r.append((name, value))
    return r
630
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # The '+' replacement happens before the % escapes are expanded.
    return unquote(string.replace('+', ' '), encoding, errors)
639
# Byte values that are never percent-encoded: ASCII letters, digits
# and '_', '.', '-'.
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_.-')
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Module-level cache, emptied by clear_cache().
_safe_quoters = {}
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000646
class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # The defaultdict base doubles as the cache: once a key is stored,
    # looking it up again doesn't call Python code at all.
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<%s %r>" % (self.__class__.__name__, dict(self))

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
668
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects. encoding and errors
    must not be specified if string is a bytes object.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Non-str input is passed on as is, so codec options make no sense.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if not string:
        return string
    codec = 'utf-8' if encoding is None else encoding
    handler = 'strict' if errors is None else errors
    return quote_from_bytes(string.encode(codec, handler), safe)
712
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if ((isinstance(string, str) and ' ' not in string) or
            (isinstance(string, bytes) and b' ' not in string)):
        return quote(string, safe, encoding, errors)
    # The space literal must have the same type as 'safe' so that the two
    # can be concatenated.
    if isinstance(safe, str):
        space = ' '
    else:
        space = b' '
    # Keep spaces unescaped during quoting, then turn them into '+'.
    string = quote(string, safe + space, encoding, errors)
    return string.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000729
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def?') -> 'abc%20def%3F'

    Raises TypeError if bs is not a bytes or bytearray object.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes([c for c in safe if c < 128])
    # Fast path: if stripping every safe byte leaves nothing, no byte needs
    # escaping and the input can be returned decoded as-is.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        # One Quoter per distinct 'safe' value, cached for later calls.
        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
    return ''.join([quoter(char) for char in bs])
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000751
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).

    Raises TypeError if query is neither a mapping nor a non-string sequence
    of tuples.
    """

    def quote_part(part):
        # bytes are quoted as they are (encoding/errors must not be passed);
        # any other object is converted with str() first.
        if isinstance(part, bytes):
            return quote_via(part, safe)
        return quote_via(str(part), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    pairs = []
    for k, v in query:
        k = quote_part(k)
        if not doseq or isinstance(v, bytes):
            pairs.append(k + '=' + quote_part(v))
        elif isinstance(v, str):
            pairs.append(k + '=' + quote_via(v, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                pairs.append(k + '=' + quote_part(v))
            else:
                # loop over the sequence: one "key=value" pair per element
                for elt in v:
                    pairs.append(k + '=' + quote_part(elt))
    return '&'.join(pairs)
831
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
846
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Despite the name, a str argument is returned as a str: it is only
    checked to be pure ASCII.  Non-str arguments are returned unchanged.
    Raises UnicodeError if a str argument contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if isinstance(url, str):
        try:
            # Round trip through ASCII purely as a validity check.
            url = url.encode("ASCII").decode()
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url
859
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; enclosing angle brackets and a leading 'URL:' prefix are
    each removed when present.
    """
    url = str(url).strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    # The 'URL:' prefix is dropped whether or not brackets were present.
    if url[:4] == 'URL:':
        url = url[4:].strip()
    return url
867
# Compiled lazily on the first call to splittype().
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The scheme is returned lower-cased.  If url has no scheme, return
    (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # DOTALL so that the opaque part may contain newlines.
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    match = _typeprog.match(url)
    if match:
        scheme, data = match.groups()
        return scheme.lower(), data
    return None, url
880
# Compiled lazily on the first call to splithost().
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    If url does not start with '//', return (None, url).  A non-empty
    remainder that does not start with '/' gets one prepended.
    """
    global _hostprog
    if _hostprog is None:
        # The authority ends at the first '/', '?' or '#'.  Stopping at '#'
        # matters: otherwise '//good#@evil/' would yield host 'good#@evil',
        # which later user/host splitting would resolve to 'evil'.
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    match = _hostprog.match(url)
    if match:
        host_port, path = match.groups()
        if path and path[0] != '/':
            path = '/' + path
        return host_port, path
    return None, url
895
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'; the user part is None when there
    is no '@' at all.
    """
    user, delim, host = host.rpartition('@')
    return (user if delim else None), host
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000900
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; the password is None when there
    is no ':' at all.
    """
    user, delim, passwd = user.partition(':')
    return user, (passwd if delim else None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000905
# splittag('/path#tag') --> '/path', 'tag'
# Compiled lazily on the first call to splitport().
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  If host does not end in
    ':' followed by digits, the port is None; a trailing ':' with no
    digits is removed from the returned host.
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

    # fullmatch() rather than match() with a '$' anchor: '$' also matches
    # just before a trailing newline, so 'host:80\n' would silently lose
    # its newline and be reported as having port '80'.
    match = _portprog.fullmatch(host)
    if match:
        host, port = match.groups()
        if port:
            return host, port
    return host, None
920
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    host, delim, port = host.rpartition(':')
    if not delim:
        # No ':' at all: rpartition() put the whole string in 'port'.
        host = port
    elif port:
        # Only plain ASCII digits count as a valid port.  int() on its own
        # would also accept signs, surrounding whitespace and non-ASCII
        # digits.
        if all(ch in '0123456789' for ch in port):
            nport = int(port)
        else:
            nport = None
        return host, nport
    return host, defport
936
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'; the query is None when there is
    no '?' at all.
    """
    path, delim, query = url.rpartition('?')
    if delim:
        return path, query
    return url, None
943
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'; the tag is None when there is
    no '#' at all.
    """
    path, delim, tag = url.rpartition('#')
    if delim:
        return path, tag
    return url, None
950
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    words = url.split(';')
    return words[0], words[1:]
956
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='; the value is None when there is
    no '=' at all.
    """
    attr, delim, value = attr.partition('=')
    return attr, (value if delim else None)