blob: c0995dc010eb591636327a23b49cfa30d5fcdf34 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaranfd41e082010-04-17 14:44:14 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000011RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000012Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
David Malcolmee255682010-12-02 16:41:00 +000014RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000015
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000019RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000020McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
the urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000028"""
29
Facundo Batista2ac5de22008-07-07 18:24:11 +000030import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000031import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000032
# Names exported by ``from <module> import *``.
__all__ = [
    "urlparse",
    "urlunparse",
    "urljoin",
    "urldefrag",
    "urlsplit",
    "urlunsplit",
    "urlencode",
    "parse_qs",
    "parse_qsl",
    "quote",
    "quote_plus",
    "quote_from_bytes",
    "unquote",
    "unquote_plus",
    "unquote_to_bytes",
]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000037
# Scheme classification tables.  The empty string '' stands for "no
# scheme given", i.e. the behaviour applied by default.

# Schemes for which urljoin() resolves relative references.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '', 'sftp', 'svn',
    'svn+ssh',
]

# Schemes whose URLs carry a "//netloc" part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu',
    'rsync', '', 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
]

# Schemes for which urlparse() splits ";params" off the path.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms', '', 'sftp',
]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000050
# Every character that may appear in a scheme name: ASCII letters,
# digits, and '+', '-', '.'.
scheme_chars = ''.join((
    'abcdefghijklmnopqrstuvwxyz',
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
    '0123456789',
    '+-.',
))

# XXX: Consider replacing with functools.lru_cache
# urlsplit() memoises its results in _parse_cache and wipes the cache
# once it holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = dict()
60
def clear_cache():
    """Clear the parse cache and the quoters cache."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000065
66
# Helpers for bytes handling
# For 3.2, applications that handle improperly quoted URLs are
# deliberately required to do their own decoding and encoding.  If valid
# use cases are presented, this may be relaxed for 3.3 by decoding with
# latin-1 internally.
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'


def _noop(obj):
    """Return *obj* untouched; the result coercion used for str inputs."""
    return obj


def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Encode a result back to bytes; the coercion used for bytes inputs."""
    encoded = obj.encode(encoding, errors)
    return encoded


def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Decode every item of *args*; falsy items become the empty str."""
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)


def _coerce_args(*args):
    """Normalise arguments to str and pick the matching result coercion.

    Returns the (possibly decoded) arguments followed by one extra item:
    ``_noop`` when the first argument is a str, ``_encode_result``
    otherwise.  Raises TypeError when str and non-str arguments are mixed.
    """
    first, others = args[0], args[1:]
    str_input = isinstance(first, str)
    # Falsy arguments are exempt from the check so that the "scheme=''"
    # default used by some functions also works with bytes input.
    if any(arg and isinstance(arg, str) != str_input for arg in others):
        raise TypeError("Cannot mix str and non-str arguments")
    if not str_input:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
102
103# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Return the bytes counterpart with every field encoded."""
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return self._encoded_counterpart(*encoded_fields)
110
111
class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Return the str counterpart with every field decoded."""
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return self._decoded_counterpart(*decoded_fields)
118
119
class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element

    Subclasses supply two properties, ``_userinfo`` and ``_hostinfo``,
    each a 2-tuple: (username, password) and (hostname, port).
    """
    __slots__ = ()

    @property
    def username(self):
        """The user name from the netloc, as reported by ``_userinfo``."""
        return self._userinfo[0]

    @property
    def password(self):
        """The password from the netloc, as reported by ``_userinfo``."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """The host name in lower case, or None when it is missing/empty."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        return hostname.lower()

    @property
    def port(self):
        """The port as an int, or None.

        None is returned when no port was given, when the port is empty
        (a bare trailing ':' as in 'host:', which RFC 3986 permits), or
        when the number is outside 0..65535.  A non-numeric port makes
        int() raise ValueError.
        """
        port = self._hostinfo[1]
        if not port:
            # Covers both None (no ':' at all) and '' / b'' (nothing after
            # the ':'); int('') would otherwise raise ValueError.
            return None
        port = int(port, 10)
        # Return None on an illegal port
        if not (0 <= port <= 65535):
            return None
        return port
150
151
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """Netloc parsing (user info and host info) for str results."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last '@'."""
        userinfo, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        username, colon, password = userinfo.partition(':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last '@'.

        A bracketed host ('[...]') is returned without its brackets.  The
        port is None when no ':' separator follows the host.
        """
        hostinfo = self.netloc.rpartition('@')[2]
        _, open_bracket, bracketed = hostinfo.partition('[')
        if open_bracket:
            hostname, _, trailer = bracketed.partition(']')
            _, colon, port = trailer.partition(':')
        else:
            hostname, colon, port = hostinfo.partition(':')
        return hostname, (port if colon else None)
180
181
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """Netloc parsing (user info and host info) for bytes results."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last b'@'."""
        userinfo, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        username, colon, password = userinfo.partition(b':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last b'@'.

        A bracketed host (b'[...]') is returned without its brackets.  The
        port is None when no b':' separator follows the host.
        """
        hostinfo = self.netloc.rpartition(b'@')[2]
        _, open_bracket, bracketed = hostinfo.partition(b'[')
        if open_bracket:
            hostname, _, trailer = bracketed.partition(b']')
            _, colon, port = trailer.partition(b':')
        else:
            hostname, colon, port = hostinfo.partition(b':')
        return hostname, (port if colon else None)
210
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000211
212from collections import namedtuple
213
# Plain tuple layouts shared by the str and bytes result classes.
_DefragResultBase = namedtuple('DefragResult', ['url', 'fragment'])
_SplitResultBase = namedtuple(
    'SplitResult', ['scheme', 'netloc', 'path', 'query', 'fragment'])
_ParseResultBase = namedtuple(
    'ParseResult',
    ['scheme', 'netloc', 'path', 'params', 'query', 'fragment'])
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000217
# Backwards-compatibility alias.  ResultBase is no longer part of the
# documented API, but deprecating it isn't worth the hassle, so the name
# is kept and simply points at _NetlocResultMixinStr.
ResultBase = _NetlocResultMixinStr
222
223# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """Structured (url, fragment) result for str data."""
    __slots__ = ()

    def geturl(self):
        """Re-attach the fragment with '#'; a falsy fragment is omitted."""
        url, fragment = self
        return url + '#' + fragment if fragment else url
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000231
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """Structured 5-field split result for str data."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL via urlunsplit()."""
        return urlunsplit(self)
236
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """Structured 6-field parse result for str data."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL via urlunparse()."""
        return urlunparse(self)
241
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000242# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """Structured (url, fragment) result for bytes data."""
    __slots__ = ()

    def geturl(self):
        """Re-attach the fragment with b'#'; a falsy fragment is omitted."""
        url, fragment = self
        return url + b'#' + fragment if fragment else url
250
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """Structured 5-field split result for bytes data."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL via urlunsplit()."""
        return urlunsplit(self)
255
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """Structured 6-field parse result for bytes data."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL via urlunparse()."""
        return urlunparse(self)
260
261# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Cross-link every str result class with its bytes twin.

    The str class gets ``_encoded_counterpart`` (used by encode()) and the
    bytes class gets ``_decoded_counterpart`` (used by decode()).
    """
    for str_cls, bytes_cls in (
            (DefragResult, DefragResultBytes),
            (SplitResult, SplitResultBytes),
            (ParseResult, ParseResultBytes)):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls


# Run once at import time, then drop the helper from the namespace.
_fix_result_transcoding()
del _fix_result_transcoding
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000274
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Only schemes listed in uses_params get ";params" split off the path.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000290
def _splitparams(url):
    """Split *url* at a ';' into (path, params).

    When the path contains a '/', only a ';' at or after the last '/' is
    considered, and (url, '') is returned if there is none.  Without any
    '/', the first ';' is used.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semi = url.find(';', last_slash)
        if semi < 0:
            return url, ''
    else:
        semi = url.find(';')
    return url[:semi], url[semi + 1:]
298 return url[:i], url[i+1:]
299
def _splitnetloc(url, start=0):
    """Return (netloc, rest) where netloc is url[start:] up to the first
    '/', '?' or '#' found at or after *start*, or to the end of *url* when
    none of those delimiters occurs."""
    candidates = (url.find(ch, start) for ch in '/?#')
    found = [pos for pos in candidates if pos >= 0]
    # The earliest delimiter wins; which one it is does not matter.
    end = min(found) if found else len(url)
    return url[start:end], url[end:]
306 return url[start:delim], url[delim:] # return (domain, rest)
307
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Results are memoised in _parse_cache.  ValueError is raised when the
    netloc contains only one of '[' and ']'."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix, rest = url[:colon], url[colon + 1:]
        if prefix == 'http':
            # optimize the common case: no character or port checks
            scheme, url = prefix.lower(), rest
        elif all(ch in scheme_chars for ch in prefix):
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            if not rest or any(ch not in '0123456789' for ch in rest):
                # not a port number
                scheme, url = prefix.lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Exactly one of the two brackets present means a broken IPv6 host.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return _coerce_result(result)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000363
def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    (scheme, netloc, path, params, query, fragment,
     _coerce_result) = _coerce_args(*components)
    if params:
        # Fold the params back onto the path before handing off.
        path = ';'.join((path, params))
    rebuilt = urlunsplit((scheme, netloc, path, query, fragment))
    return _coerce_result(rebuilt)
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000373 return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000374
def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    (scheme, netloc, url, query, fragment,
     _coerce_result) = _coerce_args(*components)
    # A "//" authority prefix is written when there is a netloc, or when
    # the scheme is a netloc-using one and the path doesn't already start
    # with "//".
    needs_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if needs_authority:
        if url and not url.startswith('/'):
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url += '?' + query
    if fragment:
        url += '#' + fragment
    return _coerce_result(url)
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000392 return _coerce_result(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000393
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    def _rebuild():
        # Assemble the result from the current values of the components.
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    # A different scheme, or one without relative references: url wins.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _rebuild()
        netloc = bnetloc
    if path[:1] == '/':
        return _rebuild()
    if not path and not params:
        # Nothing but query/fragment given: inherit the base path.
        path, params = bpath, bparams
        if not query:
            query = bquery
        return _rebuild()

    # Relative path: merge with the base directory, then resolve dots.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    while True:
        # Drop one "name/.." pair per pass; stop when a pass finds none.
        for idx in range(1, len(segments) - 1):
            if segments[idx] == '..' and segments[idx - 1] not in ('', '..'):
                del segments[idx - 1:idx + 1]
                break
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    path = '/'.join(segments)
    return _rebuild()
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000446
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return _coerce_result(DefragResult(stripped, fragment))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000462
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes-like object; a str is first encoded
    as UTF-8.  Only a '%' followed by exactly two hexadecimal digits is
    treated as an escape; any other '%' is left in place verbatim.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?  (Attribute access fails for e.g. None.)
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    chunks = string.split(b'%')
    if len(chunks) == 1:
        return string
    hexdigits = b'0123456789ABCDEFabcdef'
    pieces = [chunks[0]]
    for item in chunks[1:]:
        code = item[:2]
        # int(code, 16) alone is too lenient: it also accepts a single
        # digit, a sign or surrounding whitespace (b'a', b'+f', b' 1'),
        # none of which form a valid %xx escape.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            pieces.append(bytes([int(code, 16)]))
            pieces.append(item[2:])
        else:
            pieces.append(b'%')
            pieces.append(item)
    # A single join avoids re-copying the growing result on every escape.
    return b''.join(pieces)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000483
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    Only a '%' followed by exactly two hexadecimal digits is treated as an
    escape; any other '%' is left in place verbatim.

    unquote('abc%20def') -> 'abc def'.
    """
    if string == '':
        return string
    chunks = string.split('%')
    if len(chunks) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    pieces = [chunks[0]]
    # pct_sequence: contiguous run of percent-encoded bytes, not yet decoded
    # because several escapes may together form one multi-byte character.
    pct_sequence = bytearray()
    for item in chunks[1:]:
        code = item[:2]
        # Check the digits explicitly: bytes.fromhex() skips whitespace, so
        # it would turn '%  ' into nothing and silently drop the text.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            pct_sequence.append(int(code, 16))
            rest = item[2:]
            if not rest:
                # This segment was just a single percent-encoded character.
                # May be part of a sequence of code units, so delay decoding.
                continue
        else:
            rest = '%' + item
        # Encountered non-percent-encoded characters. Flush the current
        # pct_sequence.
        pieces.append(pct_sequence.decode(encoding, errors))
        pieces.append(rest)
        pct_sequence = bytearray()
    if pct_sequence:
        # Flush the final pct_sequence
        pieces.append(pct_sequence.decode(encoding, errors))
    return ''.join(pieces)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000527
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    Returns a dict mapping each name to the list of its values, in the
    order they appear in the query.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors):
        grouped.setdefault(name, []).append(value)
    return grouped
559
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    Returns a list of (name, value) pairs.
    """
    qs, _coerce_result = _coerce_args(qs)

    def _decode(text):
        # '+' means space in form data; then undo the percent-escapes.
        text = unquote(text.replace('+', ' '),
                       encoding=encoding, errors=errors)
        return _coerce_result(text)

    result = []
    # Fields are separated by either '&' or ';'.
    for amp_chunk in qs.split('&'):
        for field in amp_chunk.split(';'):
            if not field and not strict_parsing:
                continue
            name, equals, value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # Handle case of a control-name with no equal sign
                if not keep_blank_values:
                    continue
            if value or keep_blank_values:
                result.append((_decode(name), _decode(value)))
    return result
607
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # Plus signs are converted before unquoting, so an escaped '%2B'
    # still comes out as a literal '+'.
    return unquote(string.replace('+', ' '), encoding, errors)
616
# Byte values that are never percent-encoded: ASCII letters, digits and
# the three marks '_', '.', '-'.
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_.-')
# The same set as a bytes object, for use as an rstrip() argument.
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache of quoting callables, keyed by the normalised 'safe' bytes.
_safe_quoters = dict()
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000623
class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # The defaultdict doubles as the cache: once a byte has been looked
    # up, later lookups of that key don't call Python code at all.

    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
645
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects. encoding and
    errors may only be given when string is a str; passing either one
    together with a non-str string raises TypeError.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if not string:
        return string
    string = string.encode('utf-8' if encoding is None else encoding,
                           'strict' if errors is None else errors)
    return quote_from_bytes(string, safe)
689
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # string may be a str or bytes.  When it holds no space, the regular
    # quote() already produces the right answer.
    if isinstance(string, str):
        space_free = ' ' not in string
    elif isinstance(string, bytes):
        space_free = b' ' not in string
    else:
        space_free = False
    if space_free:
        return quote(string, safe, encoding, errors)
    # Let spaces through quote() untouched, then turn them into '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000706
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but take a bytes object instead of a str.

    No string-to-bytes encoding is performed.  The result is always an
    ASCII str.

    *bs* must be bytes or bytearray, otherwise TypeError is raised.
    *safe* may be str or bytes; non-ASCII characters in it are dropped.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''

    # Normalize 'safe' into a bytes object holding ASCII values only.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)

    # Fast path: stripping every safe byte leaves nothing, so there is
    # nothing to escape.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()

    # One quoting callable is cached per distinct 'safe' value.
    if safe not in _safe_quoters:
        _safe_quoters[safe] = Quoter(safe).__getitem__
    quoter = _safe_quoters[safe]
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000728
def _quote_query_part(part, safe, encoding, errors):
    """Quote one key or value of a query with quote_plus().

    bytes are quoted as they are (encoding/errors are not passed for them);
    any other object is converted with str() first and then quoted using
    *encoding* and *errors*.
    """
    if isinstance(part, bytes):
        return quote_plus(part, safe)
    return quote_plus(str(part), safe, encoding, errors)


def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of the query arg (keys and values) may each be either a
    str or a bytes object.  For components that are not bytes, the safe,
    encoding and errors parameters are passed on to quote_plus(); for bytes
    components only safe is passed.

    Raises TypeError if query is neither a mapping nor a sequence of
    tuples; in particular a non-empty string is rejected.
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # Non-sequence items do not work with len(); non-empty strings
            # fail the tuple test on their first element.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types get here and succeed, but
            # that's a minor nit.  Since the original implementation
            # allowed empty dicts, that behavior is preserved.
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    pairs = []
    for k, v in query:
        k = _quote_query_part(k, safe, encoding, errors)
        if not doseq or isinstance(v, (bytes, str)):
            # A single value; str/bytes are never expanded element-wise.
            values = (v,)
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # Not a sequence: treat it as one value.
                values = (v,)
            else:
                # A sequence: emit one key=value pair per element.
                values = v
        for elt in values:
            pairs.append(
                k + '=' + _quote_query_part(elt, safe, encoding, errors))
    return '&'.join(pairs)
806
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') --> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') -->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') --> 'abc def'
# quote('abc def') --> 'abc%20def'
821
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str argument is verified to be pure ASCII and handed back as a str
    (despite the function's name); any other object is returned unchanged.
    Raises UnicodeError when a str argument contains non-ASCII characters.
    """
    # Most URL schemes require ASCII.  If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        # Round-trip through ASCII purely to validate the characters.
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
834
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; an enclosing '<...>' pair and a leading 'URL:' marker are
    then removed, stripping whitespace again after each removal.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
842
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  When the URL does not start with
    a 'type:' prefix (one or more characters other than '/' and ':'
    followed by a colon), (None, url) is returned.
    """
    global _typeprog
    if _typeprog is None:
        # Compiled lazily on first use and cached at module level.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # The match covers the scheme plus its trailing colon, so everything
    # after the end of the match is the opaque remainder.
    return found.group(1).lower(), url[found.end():]
856
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '?' or '#', or at the end of the
    string.  A non-empty remainder that does not begin with '/' gets one
    prepended, e.g. '//host?q' --> 'host', '/?q'.

    Returns (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # Compiled lazily on first use and cached at module level.
        import re
        # '#' must end the host as well: otherwise '//good#@evil/' would
        # report 'good#@evil' as the host.  DOTALL keeps newlines in the
        # remainder instead of dropping a trailing one or failing to match.
        _hostprog = re.compile('^//([^/#?]*)(.*)$', re.DOTALL)

    match = _hostprog.match(url)
    if match:
        host_port, path = match.groups()
        if path and not path.startswith('/'):
            path = '/' + path
        return host_port, path
    return None, url
873
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Returns (None, host) when the
    pattern does not match, e.g. when there is no '@'.
    """
    global _userprog
    if _userprog is None:
        # Compiled lazily on first use and cached at module level.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    found = _userprog.match(host)
    if found is None:
        return None, host
    return found.groups()
885
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; the password may itself contain
    colons and newlines.  Returns (user, None) when there is no ':'.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compiled lazily on first use and cached at module level.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$', re.S)

    found = _passwdprog.match(user)
    return found.groups() if found else (user, None)
897
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  An empty port ('host:')
    is treated as no port: the trailing colon is removed and the port is
    None.  When the text after the last ':' is not all digits, or there is
    no ':', the argument is returned unchanged together with None.
    """
    global _portprog
    if _portprog is None:
        # Compiled lazily on first use and cached at module level.
        import re
        # '[0-9]*' rather than '+' so that an empty port still matches and
        # its dangling colon can be stripped from the host.
        _portprog = re.compile('^(.*):([0-9]*)$')

    match = _portprog.match(host)
    if match:
        hostname, port = match.groups()
        # An empty port means "no port"; report None, not ''.
        return hostname, port or None
    return host, None
910
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning a numeric port.

    Return the given default port if no ':' is found; it defaults to -1.
    Return the numerical port if a valid number is found after the last ':'.
    Return None as the port if there is a ':' but no valid number after it."""
    global _nportprog
    if _nportprog is None:
        # Compiled lazily on first use and cached at module level.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport

    host, port = found.groups()
    nport = None
    if port:
        try:
            nport = int(port)
        except ValueError:
            # Not a number: the port stays None.
            pass
    return host, nport
932
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Returns (url, None) when the
    pattern does not match, e.g. when there is no '?'.
    """
    global _queryprog
    if _queryprog is None:
        # Compiled lazily on first use and cached at module level.
        import re
        # Raw string: '\?' in a normal literal is an invalid escape
        # sequence and triggers a warning at compile time.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
944
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Returns (url, None) when the
    pattern does not match, e.g. when there is no '#'.
    """
    global _tagprog
    if _tagprog is None:
        # Compiled lazily on first use and cached at module level.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.groups()
956
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
962
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Returns (attr, None) when the
    pattern does not match, e.g. when there is no '='.
    """
    global _valueprog
    if _valueprog is None:
        # Compiled lazily on first use and cached at module level.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    return found.groups() if found else (attr, None)