blob: 01c9e587fbcaeee2334c11e44a47889f8c161102 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaranfd41e082010-04-17 14:44:14 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000011RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000012Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski, July 1998.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000015
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000019RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000020McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000028"""
29
Serhiy Storchaka8ea46162013-03-14 21:31:37 +020030import re
Facundo Batista2ac5de22008-07-07 18:24:11 +000031import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000032import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000033
# Public names picked up by a star-import of this module.
__all__ = [
    # splitting / joining
    "urlparse",
    "urlunparse",
    "urljoin",
    "urldefrag",
    "urlsplit",
    "urlunsplit",
    # query strings
    "urlencode",
    "parse_qs",
    "parse_qsl",
    # percent-encoding
    "quote",
    "quote_plus",
    "quote_from_bytes",
    "unquote",
    "unquote_plus",
    "unquote_to_bytes",
    # structured results
    "DefragResult",
    "ParseResult",
    "SplitResult",
    "DefragResultBytes",
    "ParseResultBytes",
    "SplitResultBytes",
]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000040
# Scheme classification tables.  The empty string '' stands for "no
# scheme given", i.e. the entry that applies by default.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '', 'sftp', 'svn',
    'svn+ssh',
]
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu',
    'rsync', '', 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
]
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms', '', 'sftp', 'tel',
]

# The three tables below are no longer consulted by this module.  They
# are undocumented but carry public-looking names, so they are kept for
# backwards compatibility.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap', 'snews',
    'sip', 'sips',
]
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms', 'gopher', 'rtsp',
    'rtspu', 'sip', 'sips', '',
]
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', 'https',
    'shttp', 'snews', 'file', 'prospero', '',
]

# Every character that may appear in a scheme name.
scheme_chars = ('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789+-.')
69
# XXX: Consider replacing with functools.lru_cache
MAX_CACHE_SIZE = 20
_parse_cache = dict()


def clear_cache():
    """Empty the module-level parse cache and the quoters cache."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000078
79
Nick Coghlan9fc443c2010-11-30 15:48:08 +000080# Helpers for bytes handling
81# For 3.2, we deliberately require applications that
82# handle improperly quoted URLs to do their own
83# decoding and encoding. If valid use cases are
84# presented, we may relax this by using latin-1
85# decoding internally for 3.3
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'


def _noop(obj):
    """Return *obj* unchanged (result coercion for str inputs)."""
    return obj


def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Return obj.encode(encoding, errors) (result coercion for non-str
    inputs)."""
    return obj.encode(encoding, errors)


def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Return a tuple with every truthy item of *args* decoded; falsy
    items (empty bytes, '', None) are replaced by ''."""
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)


def _coerce_args(*args):
    """Normalise *args* to str and pick the matching result coercion.

    Returns the (possibly decoded) arguments with one extra trailing
    element: _noop when the first argument is a str, _encode_result
    otherwise.  Raises TypeError when a non-empty later argument
    disagrees with the first one about being a str.
    """
    first_is_str = isinstance(args[0], str)
    for other in args[1:]:
        # Empty values are let through so that the "scheme=''" default
        # argument of some functions also works with bytes input.
        if not other:
            continue
        if isinstance(other, str) != first_is_str:
            raise TypeError("Cannot mix str and non-str arguments")
    if not first_is_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
115
116# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Mixin for str-based results: encode() builds the bytes twin."""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Encode every field and wrap the values in the class stored
        in the _encoded_counterpart attribute."""
        make_encoded = self._encoded_counterpart
        fields = (field.encode(encoding, errors) for field in self)
        return make_encoded(*fields)
123
124
class _ResultMixinBytes(object):
    """Mixin for bytes-based results: decode() builds the str twin."""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Decode every field and wrap the values in the class stored
        in the _decoded_counterpart attribute."""
        make_decoded = self._decoded_counterpart
        fields = (field.decode(encoding, errors) for field in self)
        return make_decoded(*fields)
131
132
class _NetlocResultMixinBase(object):
    """Accessors shared by result objects that carry a netloc element.

    Relies on the concrete class providing _userinfo (username,
    password) and _hostinfo (hostname, port) pairs.
    """
    __slots__ = ()

    @property
    def username(self):
        """First element of _userinfo."""
        return self._userinfo[0]

    @property
    def password(self):
        """Second element of _userinfo."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when it is empty or missing."""
        host = self._hostinfo[0]
        return host.lower() if host else None

    @property
    def port(self):
        """Port as an int, or None when absent or outside 0..65535.

        A port that is not a base-10 integer makes int() raise
        ValueError.
        """
        raw_port = self._hostinfo[1]
        if raw_port is None:
            return None
        number = int(raw_port, 10)
        # Return None on an illegal port
        return number if 0 <= number <= 65535 else None
163
164
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """Netloc parsing for str results."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from the text before the last '@'.

        Both are None when the netloc has no '@'; password is None when
        the user info has no ':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        username, colon, password = credentials.partition(':')
        if not colon:
            password = None
        return username, password

    @property
    def _hostinfo(self):
        """(hostname, port) taken from the text after the last '@'.

        A bracketed host ("[...]") is returned without the brackets.
        port is the raw text after ':' or None when that text is empty.
        """
        host_port = self.netloc.rpartition('@')[2]
        _, open_bracket, after_bracket = host_port.partition('[')
        if open_bracket:
            hostname, _, trailer = after_bracket.partition(']')
            port = trailer.partition(':')[2]
        else:
            hostname, _, port = host_port.partition(':')
        return hostname, (port or None)
193
194
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """Netloc parsing for bytes results."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from the bytes before the last b'@'.

        Both are None when the netloc has no b'@'; password is None when
        the user info has no b':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        username, colon, password = credentials.partition(b':')
        if not colon:
            password = None
        return username, password

    @property
    def _hostinfo(self):
        """(hostname, port) taken from the bytes after the last b'@'.

        A bracketed host (b"[...]") is returned without the brackets.
        port is the raw bytes after b':' or None when they are empty.
        """
        host_port = self.netloc.rpartition(b'@')[2]
        _, open_bracket, after_bracket = host_port.partition(b'[')
        if open_bracket:
            hostname, _, trailer = after_bracket.partition(b']')
            port = trailer.partition(b':')[2]
        else:
            hostname, _, port = host_port.partition(b':')
        return hostname, (port or None)
223
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000224
225from collections import namedtuple
226
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000227_DefragResultBase = namedtuple('DefragResult', 'url fragment')
228_SplitResultBase = namedtuple('SplitResult', 'scheme netloc path query fragment')
229_ParseResultBase = namedtuple('ParseResult', 'scheme netloc path params query fragment')
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000230
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000231# For backwards compatibility, alias _NetlocResultMixinStr
232# ResultBase is no longer part of the documented API, but it is
233# retained since deprecating it isn't worth the hassle
234ResultBase = _NetlocResultMixinStr
235
236# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """(url, fragment) pair holding str data."""
    __slots__ = ()

    def geturl(self):
        """Return url, followed by '#' and the fragment when the
        fragment is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000244
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """(scheme, netloc, path, query, fragment) holding str data."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the URL by handing this tuple to urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt
249
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """(scheme, netloc, path, params, query, fragment) holding str data."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the URL by handing this tuple to urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt
254
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000255# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """(url, fragment) pair holding bytes data."""
    __slots__ = ()

    def geturl(self):
        """Return url, followed by b'#' and the fragment when the
        fragment is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
263
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """(scheme, netloc, path, query, fragment) holding bytes data."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the URL by handing this tuple to urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt
268
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """(scheme, netloc, path, params, query, fragment) holding bytes data."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the URL by handing this tuple to urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt
273
274# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Cross-link each str result class with its bytes twin.

    Sets _encoded_counterpart on the str class and
    _decoded_counterpart on the bytes class; these are the attributes
    the encode()/decode() mixin methods look up.
    """
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls


# Run once at import time, then drop the helper from the namespace.
_fix_result_transcoding()
del _fix_result_transcoding
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000287
def urlparse(url, scheme='', allow_fragments=True):
    """Split a URL of the general form
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    into the 6-tuple (scheme, netloc, path, params, query, fragment).

    Components are not broken up further (netloc stays one string) and
    % escapes are not expanded.  The result has the same type family
    (str or bytes) as the input.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Only schemes listed in uses_params get their ';params' split off.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000303
def _splitparams(url):
    """Split ';params' off the last path segment of *url*.

    Only a ';' located at or after the last '/' counts, so parameters
    attached to earlier segments stay in the path.  Returns
    (path, params); params is '' when no such ';' exists, including
    when *url* contains neither '/' nor ';'.
    """
    last_slash = url.rfind('/')
    # With no '/', rfind() gives -1; searching from 0 then scans it all.
    semi = url.find(';', max(last_slash, 0))
    if semi < 0:
        return url, ''
    return url[:semi], url[semi + 1:]
312
def _splitnetloc(url, start=0):
    """Return (netloc, rest) for *url*, with the netloc beginning at
    index *start* and ending before the earliest '/', '?' or '#' (or at
    the end of the string when none of them occurs)."""
    hits = [pos for pos in (url.find(ch, start) for ch in '/?#')
            if pos >= 0]
    end = min(hits) if hits else len(url)
    return url[start:end], url[end:]
320
def urlsplit(url, scheme='', allow_fragments=True):
    """Split a URL of the general form
    <scheme>://<netloc>/<path>?<query>#<fragment>
    into the 5-tuple (scheme, netloc, path, query, fragment).

    Components are not broken up further (netloc stays one string) and
    % escapes are not expanded.  Raises ValueError("Invalid IPv6 URL")
    when the netloc contains only one of '[' and ']'.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    colon = url.find(':')
    if colon > 0:
        candidate, rest = url[:colon], url[colon + 1:]
        if candidate == 'http':
            # Common case: accepted without the port-number check below.
            scheme, url = candidate.lower(), rest
        elif all(ch in scheme_chars for ch in candidate):
            # Make sure the text after ':' is not actually a port number
            # (in which case the "scheme" is really part of the path).
            if not rest or any(ch not in '0123456789' for ch in rest):
                scheme, url = candidate.lower(), rest

    netloc = query = fragment = ''
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Exactly one of the two brackets present means a broken literal.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, _, fragment = url.partition('#')
    if '?' in url:
        url, _, query = url.partition('?')
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return _coerce_result(result)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000376
def urlunparse(components):
    """Rebuild a URL from a 6-item (scheme, netloc, path, params, query,
    fragment) iterable.

    The result may differ slightly from the URL that was parsed, while
    staying equivalent, if the original had redundant delimiters such
    as a '?' with an empty query.
    """
    (scheme, netloc, path, params,
     query, fragment, _coerce_result) = _coerce_args(*components)
    if params:
        path = "%s;%s" % (path, params)
    rebuilt = urlunsplit((scheme, netloc, path, query, fragment))
    return _coerce_result(rebuilt)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000387
def urlunsplit(components):
    """Rebuild a URL from a 5-item (scheme, netloc, path, query,
    fragment) iterable such as the one urlsplit() returns.

    The result may differ slightly from the URL that was parsed, while
    staying equivalent, if the original had unnecessary delimiters (for
    example a '?' with an empty query).
    """
    (scheme, netloc, path,
     query, fragment, _coerce_result) = _coerce_args(*components)
    needs_authority = netloc or (
        scheme and scheme in uses_netloc and path[:2] != '//')
    if needs_authority:
        # A non-empty path placed after '//netloc' must start with '/'.
        if path and path[:1] != '/':
            path = '/' + path
        path = '//' + (netloc or '') + path
    result = path
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return _coerce_result(result)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000406
def urljoin(base, url, allow_fragments=True):
    """Resolve *url* (possibly relative) against *base* and return the
    resulting URL.

    A falsy *base* returns *url* unchanged and a falsy *url* returns
    *base* unchanged.
    """
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    (bscheme, bnetloc, bpath,
     bparams, bquery, bfragment) = urlparse(base, '', allow_fragments)
    (scheme, netloc, path,
     params, query, fragment) = urlparse(url, bscheme, allow_fragments)

    # A different scheme, or one that has no relative form: url wins.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not path and not params:
        # Only a query and/or fragment was given: keep the base path.
        path, params = bpath, bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # The last item is not a directory, so it takes no part in
        # resolving the relative path.
        del base_parts[-1]

    # For RFC 3986: a path starting at the root ignores the base path.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # Drop empty inner elements; they would produce redundant
        # slashes when the resolved path is joined again.
        segments[1:-1] = filter(None, segments[1:-1])

    resolved_path = []
    for seg in segments:
        if seg == '.':
            continue
        if seg == '..':
            # A '..' with nothing left to remove is ignored (RFC 3986).
            if resolved_path:
                resolved_path.pop()
            continue
        resolved_path.append(seg)

    if segments[-1] in ('.', '..'):
        # The last segment was a relative directory, so the result
        # needs a trailing '/'.
        resolved_path.append('')

    joined_path = '/'.join(resolved_path) or '/'
    return _coerce_result(urlunparse((scheme, netloc, joined_path,
                                      params, query, fragment)))
Antoine Pitrou55ac5b32014-08-21 19:16:17 -0400474
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000475
def urldefrag(url):
    """Strip the fragment from *url*.

    Returns a DefragResult (url, fragment) pair, coerced back to the
    input's type; fragment is the empty string when *url* contains
    no '#'.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        fragment, stripped = '', url
    else:
        scheme, netloc, path, params, query, fragment = urlparse(url)
        stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return _coerce_result(DefragResult(stripped, fragment))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000491
_hexdig = '0123456789ABCDEFabcdef'
_hextobyte = None


def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: str input is encoded as UTF-8.  This only matters if it
    # contains unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Attribute access doubles as a "string-like object?" check.
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escaped = string.split(b'%')
    if not escaped:
        return string
    # The lookup table is built on first use so that no memory is spent
    # on it when the function is never called.
    global _hextobyte
    if _hextobyte is None:
        table = {}
        for high in _hexdig:
            for low in _hexdig:
                table[(high + low).encode()] = bytes([int(high + low, 16)])
        _hextobyte = table
    pieces = [head]
    for chunk in escaped:
        decoded = _hextobyte.get(chunk[:2])
        if decoded is None:
            # Not a valid %xx escape: keep the text verbatim.
            pieces.append(b'%')
            pieces.append(chunk)
        else:
            pieces.append(decoded)
            pieces.append(chunk[2:])
    return b''.join(pieces)
524
_asciire = re.compile('([\x00-\x7f]+)')


def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    *string* may be a str or a bytes object.  The optional encoding and
    errors parameters specify how to decode percent-encoded sequences
    into Unicode characters, as accepted by the bytes.decode() method.
    By default, percent-encoded sequences are decoded with UTF-8, and
    invalid sequences are replaced by a placeholder character.  Passing
    None for either parameter selects its default.

    unquote('abc%20def') -> 'abc def'.
    """
    if isinstance(string, bytes):
        # bytes cannot be tested against the str '%' below; unescape
        # them directly and decode the result.
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        return unquote_to_bytes(string).decode(encoding, errors)
    if '%' not in string:
        # Attribute access doubles as a "string-like object?" check.
        string.split
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    # _asciire.split() alternates non-ASCII text (even indexes) with
    # runs of ASCII (odd indexes); only the ASCII runs can hold escapes.
    bits = _asciire.split(string)
    res = [bits[0]]
    append = res.append
    for i in range(1, len(bits), 2):
        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
        append(bits[i + 1])
    return ''.join(res)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000551
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query string into a dict mapping each field name to the
    list of its values, in order of appearance.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be kept.  A true value keeps
        them as blank strings; the default false value drops them as
        if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.
    """
    grouped = {}
    fields = parse_qsl(qs, keep_blank_values, strict_parsing,
                       encoding=encoding, errors=errors)
    for name, value in fields:
        grouped.setdefault(name, []).append(value)
    return grouped
Facundo Batistac469d4c2008-09-03 22:49:01 +0000583
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: percent-encoded query string to be parsed; fields are
        separated by '&' or ';'

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be kept.  A true value keeps
        them as blank strings; the default false value drops them as
        if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    Returns a list.
    """
    qs, _coerce_result = _coerce_args(qs)

    def _decode(text):
        # '+' stands for a space; then expand the % escapes.
        text = text.replace('+', ' ')
        text = unquote(text, encoding=encoding, errors=errors)
        return _coerce_result(text)

    result = []
    for amp_chunk in qs.split('&'):
        for field in amp_chunk.split(';'):
            if not field and not strict_parsing:
                continue
            name, equals, value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control name with no equal sign is kept only when
                # blank values are wanted.
                if not keep_blank_values:
                    continue
            if value or keep_blank_values:
                result.append((_decode(name), _decode(value)))
    return result
631
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but plus signs are first turned into spaces, as
    needed for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    with_spaces = string.replace('+', ' ')
    return unquote(with_spaces, encoding, errors)
640
# Byte values that are never percent-encoded, whatever the caller's
# "safe" set: ASCII letters, digits and '_', '.', '-'.
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_.-')
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
_safe_quoters = dict()
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000647
class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    A byte found in the safe set (the given safe set plus the
    always-safe default set) maps to its own character; any other byte
    maps to its '%XX' percent-encoded form.
    """
    # The dict itself is the cache: once a key has been filled in by
    # __missing__, later lookups no longer call Python code at all.

    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, the object would display as a plain defaultdict.
        return "<%s %r>" % (self.__class__.__name__, dict(self))

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
669
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, quote() is meant for the path section of a URL, so it
    leaves '/' alone: the slashes already present in a path are acting
    as reserved characters.

    string and safe may be either str or bytes objects.  encoding and
    errors must not be specified if string is a bytes object; doing so
    raises TypeError.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    elif not string:
        # An empty str is returned as is.
        return string
    else:
        string = string.encode(
            'utf-8' if encoding is None else encoding,
            'strict' if errors is None else errors)
    return quote_from_bytes(string, safe)
713
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but spaces become '+', as HTML form encoding requires.

    A '+' already present in string is percent-encoded unless it is listed
    in safe.  Unlike quote(), safe defaults to the empty string, so '/' is
    quoted as well.
    """
    # string may be str or bytes; when it holds no space at all, plain
    # quote() already gives the final answer.
    if isinstance(string, str):
        space_free = ' ' not in string
    elif isinstance(string, bytes):
        space_free = b' ' not in string
    else:
        space_free = False
    if space_free:
        return quote(string, safe, encoding, errors)

    # Let the spaces through unquoted (the space must have the same type as
    # safe to be appended to it), then turn them into plus signs.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000730
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but takes a bytes object and does no str encoding.

    The result is always an ASCII-only str.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'

    bs must be a bytes or bytearray object, otherwise TypeError is raised.
    safe may be str or bytes; non-ASCII entries in it are ignored.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''

    # Normalize 'safe' to a bytes object holding ASCII values only.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)

    # Shortcut: stripping every safe byte leaves nothing, so nothing needs
    # quoting and the input can simply be decoded.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()

    # One lookup function per distinct safe set, built on first use.
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = Quoter(safe).__getitem__
        _safe_quoters[safe] = quoter
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000752
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.
    Anything else is converted with str() before being quoted.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).

    Raises TypeError if query is neither a mapping nor a sequence whose
    first element is a tuple (a non-empty string, for instance).
    """

    def quote_component(value):
        # bytes are quoted as they are; encoding/errors only make sense for
        # text, so they are passed along only after conversion to str.
        if isinstance(value, bytes):
            return quote_via(value, safe)
        return quote_via(str(value), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            # Report a uniform message but keep the traceback of the
            # operation that actually failed.
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    pairs = []
    for k, v in query:
        k = quote_component(k)
        if not doseq:
            pairs.append(k + '=' + quote_component(v))
        elif isinstance(v, bytes):
            pairs.append(k + '=' + quote_via(v, safe))
        elif isinstance(v, str):
            # A str is a sequence too, but it is one value, not many.
            pairs.append(k + '=' + quote_via(v, safe, encoding, errors))
        else:
            try:
                # Anything supporting len() is treated as a sequence.
                len(v)
            except TypeError:
                # not a sequence
                pairs.append(k + '=' + quote_component(v))
            else:
                # one "key=value" pair per element of the sequence
                pairs.extend(k + '=' + quote_component(elt) for elt in v)
    return '&'.join(pairs)
832
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') --> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') -->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') --> 'abc def'
# quote('abc def') --> 'abc%20def'
847
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Despite the name, a str argument comes back as a str: it is only
    checked to be pure ASCII, and UnicodeError is raised when it is not.
    Arguments of any other type are returned unchanged.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        # Round trip through ASCII: fails on the first non-ASCII character.
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
860
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; an enclosing '<...>' pair and a leading 'URL:' marker are
    then removed, stripping whitespace again after each removal.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
868
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type (scheme) is returned in lower case.  If url does not start
    with a 'type:' prefix, the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # Compiled on first use only.
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = _typeprog.match(url)
    if found is None:
        return None, url
    scheme, rest = found.groups()
    return scheme.lower(), rest
881
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '?' or '#'.  When what follows
    the host is not empty and does not start with '/', a '/' is put in
    front of it, e.g. '//host?q' --> 'host', '/?q'.
    If url does not start with '//', the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        # '#' must terminate the host as well as '/' and '?': otherwise
        # '//127.0.0.1#@evil.com' would yield the host
        # '127.0.0.1#@evil.com', and a later split on '@' would pick the
        # wrong machine.
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    match = _hostprog.match(url)
    if match:
        host_port, path = match.groups()
        if path and path[0] != '/':
            path = '/' + path
        return host_port, path
    return None, url
896
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'; without any '@' the user part
    is None.
    """
    credentials, at_sign, hostport = host.rpartition('@')
    if not at_sign:
        return None, hostport
    return credentials, hostport
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000901
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; without any ':' the password
    is None.
    """
    name, colon, secret = user.partition(':')
    if not colon:
        secret = None
    return name, secret
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000906
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of ASCII digits.  It is None when
    there is no ':' or when what follows the last ':' is not made of
    digits only, in which case host is returned unchanged.  A trailing
    ':' with nothing after it is dropped: 'host:' --> 'host', None.
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

    # fullmatch() rather than match() with a '$' anchor: '$' also matches
    # just before a trailing newline, which let 'host:80\n' through and
    # silently dropped the newline.
    match = _portprog.fullmatch(host)
    if match:
        host, port = match.groups()
        if port:
            return host, port
    return host, None
921
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Return the given default port if no ':' is found or if nothing
    follows the last ':'; the default is -1.
    Return the port as an int if ASCII digits only are found after ':'.
    Return None as the port if ':' is followed by anything else.
    """
    host, delim, port = host.rpartition(':')
    if not delim:
        # No ':' at all: rpartition() put the whole input in the last slot.
        host = port
    elif port:
        # Do not rely on int() to validate: it also accepts a sign,
        # surrounding whitespace and non-ASCII digits, so 'host:-1' would
        # come back as port -1 -- the very value meaning "no port given".
        if all('0' <= ch <= '9' for ch in port):
            nport = int(port)
        else:
            nport = None
        return host, nport
    return host, defport
937
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'; without any '?' the result is
    (url, None).
    """
    head, question_mark, tail = url.rpartition('?')
    return (head, tail) if question_mark else (url, None)
944
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'; without any '#' the result is
    (url, None).
    """
    head, hash_sign, fragment = url.rpartition('#')
    return (head, fragment) if hash_sign else (url, None)
951
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url holds no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
957
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='; without any '=' the value
    is None.
    """
    name, equals, value = attr.partition('=')
    if not equals:
        value = None
    return name, value