blob: b6ac414dfdfb5e0da036641f25583461c278f86c [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaranfd41e082010-04-17 14:44:14 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000011RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000012Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
David Malcolmee255682010-12-02 16:41:00 +000014RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000015
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000019RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000020McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to the
urlparse module should conform with it. The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000028"""
29
Serhiy Storchaka8ea46162013-03-14 21:31:37 +020030import re
Facundo Batista2ac5de22008-07-07 18:24:11 +000031import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000032import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000033
Jeremy Hylton1afc1692008-06-18 20:49:58 +000034__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
Senthil Kumaran0256b2a2010-10-25 16:36:20 +000035 "urlsplit", "urlunsplit", "urlencode", "parse_qs",
36 "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
Guido van Rossum52dbbb92008-08-18 21:44:30 +000037 "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000038
# Scheme classification tables. The empty string '' stands for "no scheme
# given", i.e. the behaviour applied by default.

# Schemes for which urljoin() resolves a URL relative to its base.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', '', 'sftp',
    'svn', 'svn+ssh',
]

# Schemes whose URLs carry a network location ("//netloc") part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
    'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
]

# Schemes for which urlparse() splits ";params" off the path.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    'mms', '', 'sftp', 'tel',
]

# The three lists below are no longer consulted by this module. They are
# undocumented but have public-looking names, so they stay for backwards
# compatibility.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips',
]
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '',
]
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero', '',
]
61
Jeremy Hylton1afc1692008-06-18 20:49:58 +000062# Characters valid in scheme names
63scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
64 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
65 '0123456789'
66 '+-.')
67
Nick Coghlan9fc443c2010-11-30 15:48:08 +000068# XXX: Consider replacing with functools.lru_cache
Jeremy Hylton1afc1692008-06-18 20:49:58 +000069MAX_CACHE_SIZE = 20
70_parse_cache = {}
71
def clear_cache():
    """Empty the module-level caches: parsed URLs and cached quoters."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000076
77
Nick Coghlan9fc443c2010-11-30 15:48:08 +000078# Helpers for bytes handling
79# For 3.2, we deliberately require applications that
80# handle improperly quoted URLs to do their own
81# decoding and encoding. If valid use cases are
82# presented, we may relax this by using latin-1
83# decoding internally for 3.3
84_implicit_encoding = 'ascii'
85_implicit_errors = 'strict'
86
87def _noop(obj):
88 return obj
89
90def _encode_result(obj, encoding=_implicit_encoding,
91 errors=_implicit_errors):
92 return obj.encode(encoding, errors)
93
94def _decode_args(args, encoding=_implicit_encoding,
95 errors=_implicit_errors):
96 return tuple(x.decode(encoding, errors) if x else '' for x in args)
97
98def _coerce_args(*args):
99 # Invokes decode if necessary to create str args
100 # and returns the coerced inputs along with
101 # an appropriate result coercion function
102 # - noop for str inputs
103 # - encoding function otherwise
104 str_input = isinstance(args[0], str)
105 for arg in args[1:]:
106 # We special-case the empty string to support the
107 # "scheme=''" default argument to some functions
108 if arg and isinstance(arg, str) != str_input:
109 raise TypeError("Cannot mix str and non-str arguments")
110 if str_input:
111 return args + (_noop,)
112 return _decode_args(args) + (_encode_result,)
113
114# Result objects are more helpful than simple tuples
115class _ResultMixinStr(object):
116 """Standard approach to encoding parsed results from str to bytes"""
117 __slots__ = ()
118
119 def encode(self, encoding='ascii', errors='strict'):
120 return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))
121
122
123class _ResultMixinBytes(object):
124 """Standard approach to decoding parsed results from bytes to str"""
125 __slots__ = ()
126
127 def decode(self, encoding='ascii', errors='strict'):
128 return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))
129
130
131class _NetlocResultMixinBase(object):
132 """Shared methods for the parsed result objects containing a netloc element"""
133 __slots__ = ()
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000134
135 @property
136 def username(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000137 return self._userinfo[0]
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000138
139 @property
140 def password(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000141 return self._userinfo[1]
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000142
143 @property
144 def hostname(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000145 hostname = self._hostinfo[0]
146 if not hostname:
147 hostname = None
148 elif hostname is not None:
149 hostname = hostname.lower()
150 return hostname
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000151
152 @property
153 def port(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000154 port = self._hostinfo[1]
155 if port is not None:
156 port = int(port, 10)
Senthil Kumaran2fc5a502012-05-24 21:56:17 +0800157 # Return None on an illegal port
158 if not ( 0 <= port <= 65535):
159 return None
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000160 return port
161
162
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """Splits a str netloc into user info and host info."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from before the last '@'.

        Both are None when there is no '@'; password is None when the
        user info contains no ':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) taken from after the last '@'.

        A bracketed host has its brackets removed; an empty or missing
        port is reported as None.
        """
        host_and_port = self.netloc.rpartition('@')[2]
        _, open_bracket, after_bracket = host_and_port.partition('[')
        if open_bracket:
            host, _, tail = after_bracket.partition(']')
            port = tail.partition(':')[2]
        else:
            host, _, port = host_and_port.partition(':')
        return host, (port or None)
191
192
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """Splits a bytes netloc into user info and host info."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from before the last b'@'.

        Both are None when there is no b'@'; password is None when the
        user info contains no b':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(b':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) taken from after the last b'@'.

        A bracketed host has its brackets removed; an empty or missing
        port is reported as None.
        """
        host_and_port = self.netloc.rpartition(b'@')[2]
        _, open_bracket, after_bracket = host_and_port.partition(b'[')
        if open_bracket:
            host, _, tail = after_bracket.partition(b']')
            port = tail.partition(b':')[2]
        else:
            host, _, port = host_and_port.partition(b':')
        return host, (port or None)
221
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000222
223from collections import namedtuple
224
# Plain tuple layouts shared by the str and bytes flavours of each result.
_DefragResultBase = namedtuple(
    'DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

# ResultBase is no longer part of the documented API. It is kept as an
# alias of _NetlocResultMixinStr for backwards compatibility, because
# deprecating it isn't worth the hassle.
ResultBase = _NetlocResultMixinStr
233
234# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """str result of urldefrag(): (url, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Return the URL with a non-empty fragment re-attached after '#'."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000242
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """str result of urlsplit(): (scheme, netloc, path, query, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt from this result by urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt
247
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """str result of urlparse():
    (scheme, netloc, path, params, query, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt from this result by urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt
252
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000253# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """bytes flavour of DefragResult: (url, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Return the URL with a non-empty fragment re-attached after b'#'."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
261
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """bytes flavour of SplitResult."""
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt from this result by urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt
266
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """bytes flavour of ParseResult."""
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt from this result by urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt
271
272# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Cross-link every str result class with its bytes twin.

    After this runs, encode() on a str result and decode() on a bytes
    result can find the class to instantiate.
    """
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# Run once at import time, then remove the helper from the namespace.
_fix_result_transcoding()
del _fix_result_transcoding
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000285
def urlparse(url, scheme='', allow_fragments=True):
    """Split a URL into its 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    The result is a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Components are not broken down any further (e.g. netloc stays a single
    string) and % escapes are left unexpanded."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Only some schemes carry ";params" on the path.
    if ';' in path and scheme in uses_params:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000301
302def _splitparams(url):
303 if '/' in url:
304 i = url.find(';', url.rfind('/'))
305 if i < 0:
306 return url, ''
307 else:
308 i = url.find(';')
309 return url[:i], url[i+1:]
310
311def _splitnetloc(url, start=0):
312 delim = len(url) # position of end of domain part of url, default is end
313 for c in '/?#': # look for delimiters; the order is NOT important
314 wdelim = url.find(c, start) # find first of this delim
315 if wdelim >= 0: # if found
316 delim = min(delim, wdelim) # use earliest delim position
317 return url[start:delim], url[delim:] # return (domain, rest)
318
def urlsplit(url, scheme='', allow_fragments=True):
    """Split a URL into its 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    The result is a 5-tuple: (scheme, netloc, path, query, fragment).
    Components are not broken down any further (e.g. netloc stays a single
    string) and % escapes are left unexpanded.

    Raises ValueError("Invalid IPv6 URL") when the netloc contains only one
    of '[' and ']'."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        candidate, rest = url[:colon], url[colon + 1:]
        if candidate == 'http':
            # optimize the common case: no character or port checks needed
            scheme, url = candidate.lower(), rest
        elif all(ch in scheme_chars for ch in candidate):
            # make sure "rest" is not actually a port number (in which
            # case the candidate scheme is really part of the path)
            if not rest or any(ch not in '0123456789' for ch in rest):
                scheme, url = candidate.lower(), rest
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Brackets must come in pairs: exactly one of them is an error.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return _coerce_result(result)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000374
def urlunparse(components):
    """Reassemble a URL from the six components produced by urlparse().

    The result may differ slightly from the URL that was parsed, while
    remaining equivalent, if the original had redundant delimiters, e.g.
    a ? with an empty query (the draft states that these are equivalent)."""
    scheme, netloc, path, params, query, fragment, _coerce_result = (
        _coerce_args(*components))
    if params:
        # Fold the params back into the path before delegating.
        path = "%s;%s" % (path, params)
    joined = urlunsplit((scheme, netloc, path, query, fragment))
    return _coerce_result(joined)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000385
def urlunsplit(components):
    """Reassemble a URL from the five components produced by urlsplit().

    *components* is unpacked into exactly five items: scheme, netloc, path,
    query and fragment. The result may differ slightly from the URL that
    was parsed, while remaining equivalent, if the original had unnecessary
    delimiters (for example, a ? with an empty query; the RFC states that
    these are equivalent)."""
    scheme, netloc, url, query, fragment, _coerce_result = (
        _coerce_args(*components))
    result = url
    # Emit "//netloc" when there is a netloc, or when the scheme is a
    # netloc-using one and the path does not already start with "//".
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if result and result[:1] != '/':
            result = '/' + result
        result = '//' + (netloc or '') + result
    if scheme:
        result = scheme + ':' + result
    for delimiter, component in (('?', query), ('#', fragment)):
        if component:
            result = result + delimiter + component
    return _coerce_result(result)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000404
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If *base* is empty, *url* is returned unchanged; if *url* is empty,
    *base* is returned unchanged. *url* is also returned as-is when its
    scheme differs from the base's or is not in uses_relative. Otherwise
    the relative reference is resolved against the base: '.' and '..'
    segments are removed, and empty interior segments are dropped so that a
    base path ending in '/' does not yield a doubled slash.
    """
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            # The reference has its own authority: nothing to inherit.
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not path and not params:
        # Empty path: reuse the base path (and its query, unless the
        # reference brings its own).
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        del base_parts[-1]

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # A base path ending in '/' leaves an empty item at the end of
        # base_parts. Left in place it would produce a redundant slash on
        # re-joining ("/b/c//g") and would absorb a following '..'.
        # Drop empty interior segments, keeping the leading one (root)
        # and the last one (trailing slash).
        segments[1:-1] = filter(None, segments[1:-1])

    resolved_path = []

    for seg in segments:
        if seg == '..':
            try:
                resolved_path.pop()
            except IndexError:
                # ignore any .. segments that would otherwise cause an
                # IndexError when popped from resolved_path if resolving
                # for rfc3986
                pass
        elif seg == '.':
            continue
        else:
            resolved_path.append(seg)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    return _coerce_result(urlunparse((scheme, netloc, '/'.join(
        resolved_path), params, query, fragment)))
469
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000470
def urldefrag(url):
    """Strip the fragment, if any, from *url*.

    Returns a DefragResult holding the URL without its fragment and the
    fragment itself. The fragment is the empty string when *url* contains
    no '#'. For bytes input the result is encoded back to bytes.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    scheme, netloc, path, params, query, frag = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return _coerce_result(DefragResult(stripped, frag))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000486
_hexdig = '0123456789ABCDEFabcdef'
# Maps every two-hex-digit bytes pair to its single byte; built lazily.
_hextobyte = None


def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: str input is encoded as UTF-8. This is only an issue if it
    # contains unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escaped = string.split(b'%')
    if not escaped:
        # No '%' at all: nothing to unquote.
        return string
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    global _hextobyte
    if _hextobyte is None:
        table = {}
        for high in _hexdig:
            for low in _hexdig:
                pair = high + low
                table[pair.encode()] = bytes([int(pair, 16)])
        _hextobyte = table
    pieces = [head]
    for chunk in escaped:
        byte = _hextobyte.get(chunk[:2])
        if byte is None:
            # Not a valid escape: keep the '%' and the text literally.
            pieces.append(b'%')
            pieces.append(chunk)
        else:
            pieces.append(byte)
            pieces.append(chunk[2:])
    return b''.join(pieces)
519
# Matches a run of ASCII characters; used by unquote() to split its input.
_asciire = re.compile('([\x00-\x7f]+)')


def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    The optional encoding and errors parameters control how percent-encoded
    sequences are decoded into Unicode characters, as accepted by the
    bytes.decode() method. A value of None selects the default for either.
    By default, percent-encoded sequences are decoded with UTF-8, and
    invalid sequences are replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if '%' not in string:
        # Touch .split so that non-string-like objects raise.
        string.split
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    # The split alternates non-ASCII text (even indexes) with ASCII runs
    # (odd indexes); only the ASCII runs can contain escapes.
    parts = _asciire.split(string)
    out = [parts[0]]
    for ascii_run, other in zip(parts[1::2], parts[2::2]):
        out.append(unquote_to_bytes(ascii_run).decode(encoding, errors))
        out.append(other)
    return ''.join(out)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000546
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query string into a dict mapping each name to a list of values.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value keeps blanks as blank strings. The default false
            value ignores blank values, as if they were not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.
    """
    grouped = {}
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                      encoding=encoding, errors=errors)
    for name, value in pairs:
        # Values of a repeated name accumulate in order of appearance.
        grouped.setdefault(name, []).append(value)
    return grouped
Facundo Batistac469d4c2008-09-03 22:49:01 +0000578
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings. A
        true value keeps blanks as blank strings. The default false
        value ignores blank values, as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    Returns a list of (name, value) tuples, in order of appearance.
    """
    qs, _coerce_result = _coerce_args(qs)
    # Fields may be separated by '&' as well as ';'.
    fields = [piece for chunk in qs.split('&') for piece in chunk.split(';')]
    parsed = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        name, equals, value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # Handle case of a control-name with no equal sign
            if not keep_blank_values:
                continue
        if value or keep_blank_values:
            name = name.replace('+', ' ')
            name = unquote(name, encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = value.replace('+', ' ')
            value = unquote(value, encoding=encoding, errors=errors)
            value = _coerce_result(value)
            parsed.append((name, value))
    return parsed
626
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but plus signs are first turned into spaces, as
    required for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    with_spaces = string.replace('+', ' ')
    return unquote(with_spaces, encoding, errors)
635
636_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
637 b'abcdefghijklmnopqrstuvwxyz'
638 b'0123456789'
639 b'_.-')
Florent Xiclunac7b8e862010-05-17 17:33:07 +0000640_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
641_safe_quoters = {}
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000642
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000643class Quoter(collections.defaultdict):
644 """A mapping from bytes (in range(0,256)) to strings.
645
646 String values are percent-encoded byte values, unless the key < 128, and
647 in the "safe" set (either the specified safe set, or default set).
648 """
649 # Keeps a cache internally, using defaultdict, for efficiency (lookups
650 # of cached keys don't call Python code at all).
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000651 def __init__(self, safe):
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000652 """safe: bytes object."""
Florent Xiclunac7b8e862010-05-17 17:33:07 +0000653 self.safe = _ALWAYS_SAFE.union(safe)
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000654
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000655 def __repr__(self):
656 # Without this, will just display as a defaultdict
Serhiy Storchaka465e60e2014-07-25 23:36:00 +0300657 return "<%s %r>" % (self.__class__.__name__, dict(self))
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000658
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000659 def __missing__(self, b):
660 # Handle a cache miss. Store quoted string in cache and return.
Florent Xiclunac7b8e862010-05-17 17:33:07 +0000661 res = chr(b) if b in self.safe else '%{:02X}'.format(b)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000662 self[b] = res
663 return res
664
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, quote() is meant for the path section of a URL, so it
    does not encode '/': in typical usage the slashes already present in
    a path act as reserved characters.

    string and safe may be either str or bytes objects. encoding and
    errors must not be given when string is not a str; doing so raises
    TypeError.

    For str input, the optional encoding and errors parameters specify how
    to deal with non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if not string:
        # Nothing to quote in an empty str.
        return string
    payload = string.encode('utf-8' if encoding is None else encoding,
                            'strict' if errors is None else errors)
    return quote_from_bytes(payload, safe)
708
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but additionally turn ' ' into '+', as needed when
    quoting HTML form values.  A literal plus sign in the input is escaped
    unless safe contains it.  Unlike quote(), safe does not default to '/'.
    """
    # string may be str or bytes; when it holds no space at all, plain
    # quote() already gives the right answer.
    if isinstance(string, str):
        may_have_space = ' ' in string
    elif isinstance(string, bytes):
        may_have_space = b' ' in string
    else:
        may_have_space = True
    if not may_have_space:
        return quote(string, safe, encoding, errors)

    # Let spaces survive quoting (the extra "safe" character must have the
    # same type as safe), then swap them for '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000725
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but takes a bytes (or bytearray) object instead of a
    str and performs no string-to-bytes encoding.  The result is always an
    ASCII string.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'

    Raises TypeError when bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''

    # Normalize 'safe' to a bytes object holding ASCII values only.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes([value for value in safe if value < 128])

    # Fast path: stripping every safe byte leaves nothing, so no byte
    # needs an escape.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()

    # One quoting callable is cached per distinct 'safe' value.
    try:
        quote_byte = _safe_quoters[safe]
    except KeyError:
        quote_byte = Quoter(safe).__getitem__
        _safe_quoters[safe] = quote_byte
    return ''.join([quote_byte(value) for value in bs])
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000747
def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.
    bytes components are passed to quote_plus with safe only; anything else
    is converted with str() and passed to quote_plus together with the safe,
    encoding and errors parameters.

    Raises TypeError if query is neither a mapping nor a sequence whose
    first element is a tuple (a non-empty string, for instance).
    """

    def quote_component(value):
        # bytes are quoted as-is; every other object goes through str().
        if isinstance(value, bytes):
            return quote_plus(value, safe)
        return quote_plus(str(value), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    pairs = []
    for k, v in query:
        key = quote_component(k)
        if not doseq or isinstance(v, bytes):
            pairs.append(key + '=' + quote_component(v))
        elif isinstance(v, str):
            # Already text: quote it directly, without a str() round trip.
            pairs.append(key + '=' + quote_plus(v, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                pairs.append(key + '=' + quote_component(v))
            else:
                # one "key=value" parameter per element of the sequence
                pairs.extend(key + '=' + quote_component(elt) for elt in v)
    return '&'.join(pairs)
825
826# Utilities to parse URLs (most of these return None for missing parts):
827# unwrap('<URL:type://host/path>') --> 'type://host/path'
828# splittype('type:opaquestring') --> 'type', 'opaquestring'
829# splithost('//host[:port]/path') --> 'host[:port]', '/path'
830# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
831# splitpasswd('user:passwd') -> 'user', 'passwd'
832# splitport('host:port') --> 'host', 'port'
833# splitquery('/path?query') --> '/path', 'query'
834# splittag('/path#tag') --> '/path', 'tag'
835# splitattr('/path;attr1=value1;attr2=value2;...') ->
836# '/path', ['attr1=value1', 'attr2=value2', ...]
837# splitvalue('attr=value') --> 'attr', 'value'
838# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
840
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str is checked to be pure ASCII and returned as a str; any other
    object is returned unchanged.  Raises UnicodeError for a str that
    contains non-ASCII characters.
    """
    # Most URL schemes require ASCII.  If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
853
854def unwrap(url):
855 """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
856 url = str(url).strip()
857 if url[:1] == '<' and url[-1:] == '>':
858 url = url[1:-1].strip()
859 if url[:4] == 'URL:': url = url[4:].strip()
860 return url
861
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The scheme is returned lower-cased.  When url has no leading
    'scheme:' (one or more characters other than '/' and ':' followed by
    a colon), the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # Compiled on first use and kept in the module-level cache.
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if not found:
        return None, url
    scheme = found.group(1)
    # The match starts at 0 and ends right after the colon.
    remainder = url[found.end():]
    return scheme.lower(), remainder
874
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '?' or '#'.  If the remainder is
    non-empty and does not start with '/', a '/' is prepended to it.
    Returns (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # '#' must end the host just like '/' and '?': otherwise
        # '//127.0.0.1#@evil.com/' would report '127.0.0.1#@evil.com' as
        # the host, and a later user/host split would pick 'evil.com'.
        # DOTALL lets the remainder contain newlines instead of making the
        # whole match fail.
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    match = _hostprog.match(url)
    if match:
        host_port, path = match.groups()
        if path and not path.startswith('/'):
            path = '/' + path
        return host_port, path
    return None, url
890
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Returns (None, host) when host
    contains no '@'.
    """
    global _userprog
    if _userprog is None:
        # re.S so that user info containing a newline is still split
        # (splitpasswd already accepts such passwords) and a trailing
        # newline is kept instead of being silently dropped by '$'.
        _userprog = re.compile('^(.*)@(.*)$', re.S)

    match = _userprog.match(host)
    if match:
        return match.group(1, 2)
    return None, host
901
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; the password may contain further
    colons and newlines.  Returns (user, None) when there is no ':'.
    """
    global _passwdprog
    if _passwdprog is None:
        _passwdprog = re.compile('^([^:]*):(.*)$',re.S)

    found = _passwdprog.match(user)
    if not found:
        return user, None
    return found.group(1, 2)
912
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  When the text after the
    last ':' is empty, the host is returned without that colon and the
    port is None.  When host does not end in ':' plus optional digits,
    the result is (host, None).
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('^(.*):([0-9]*)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    name, port = found.groups()
    # An empty port ('host:') is reported as None.
    return name, (port or None)
927
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning a numeric port.

    Return the given default port (-1 unless overridden) if no ':' is
    found or nothing follows the last ':'.
    Return the port as an int if the text after the last ':' converts
    with int().
    Return None as the port if that text is not a valid number.
    """
    global _nportprog
    if _nportprog is None:
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    name, port = found.group(1, 2)
    if not port:
        # 'host:' -- the colon is dropped and the default port applies.
        return name, defport
    try:
        number = int(port)
    except ValueError:
        number = None
    return name, number
948
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Returns (url, None) when url
    contains no '?'.
    """
    global _queryprog
    if _queryprog is None:
        # Raw string: '\?' in a plain literal is an invalid escape sequence
        # and triggers a warning at compile time.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
959
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Returns (url, None) when url
    contains no '#'.
    """
    global _tagprog
    if _tagprog is None:
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if not found:
        return url, None
    return found.group(1, 2)
970
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
976
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Returns (attr, None) when the
    pattern does not match, e.g. when attr contains no '='.
    """
    global _valueprog
    if _valueprog is None:
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if not found:
        return attr, None
    return found.group(1, 2)