blob: 45ae202d5c630b74a7a49f150ec1df1995bc62aa [file] [log] [blame]
"""Parse (absolute and relative) URLs.

The urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to the
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
"""
29
Facundo Batista2ac5de22008-07-07 18:24:11 +000030import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000031import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000032
# Public names exported by a star import of this module.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000037
# A classification of schemes ('' means apply by default)

# Schemes for which urljoin() resolves a relative reference against the base.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
# Schemes whose URLs carry a "//netloc" part; consulted by urlunsplit()
# and urljoin().
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
# NOTE(review): not referenced by any code visible in this part of the
# module; presumably kept for external users -- confirm before removing.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
# Schemes for which urlparse() splits ";params" off the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
# Schemes for which urlsplit() splits off a "?query" (the 'http' fast
# path in urlsplit() splits it unconditionally).
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
# Schemes for which urlsplit() splits off a "#fragment" when
# allow_fragments is true.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
# (urlsplit() only treats the text before the first ':' as a scheme when
# every character of it appears in this string.)
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
62
# XXX: Consider replacing with functools.lru_cache
# urlsplit() memoizes its results in _parse_cache; once the dict holds
# MAX_CACHE_SIZE entries it is emptied wholesale via clear_cache().
MAX_CACHE_SIZE = 20
_parse_cache = {}
66
def clear_cache():
    """Empty the URL parse cache and the cache of safe-character quoters."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000071
72
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
#
# These two values are the defaults used by _decode_args() and
# _encode_result() when non-str arguments are converted to str and back.
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
81
def _noop(obj):
    """Return obj unchanged; the result-coercion function for str inputs."""
    return obj
84
def _encode_result(obj, encoding=_implicit_encoding,
                        errors=_implicit_errors):
    """Encode obj via obj.encode(encoding, errors).

    Returned by _coerce_args() as the result-coercion function when the
    inputs were not str.
    """
    return obj.encode(encoding, errors)
88
def _decode_args(args, encoding=_implicit_encoding,
                       errors=_implicit_errors):
    """Decode every item of args to str and return them as a tuple.

    Falsy items (empty bytes, None, the '' default) become ''.
    """
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)
92
def _coerce_args(*args):
    """Normalize the arguments to str and pick the matching result coercer.

    Returns the (possibly decoded) arguments followed by one extra item:
    _noop when the first argument is a str, _encode_result otherwise.

    Raises TypeError when a non-empty later argument disagrees with the
    first argument about being a str.
    """
    expect_str = isinstance(args[0], str)
    for candidate in args[1:]:
        # Empty values are exempt so that the "scheme=''" default
        # argument of some functions can be combined with bytes input.
        if not candidate:
            continue
        if isinstance(candidate, str) != expect_str:
            raise TypeError("Cannot mix str and non-str arguments")
    if not expect_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
108
109# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Mixin that converts a str result tuple into its bytes counterpart."""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Return the counterpart result built from every field encoded."""
        counterpart = self._encoded_counterpart
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return counterpart(*encoded_fields)
116
117
class _ResultMixinBytes(object):
    """Mixin that converts a bytes result tuple into its str counterpart."""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Return the counterpart result built from every field decoded."""
        counterpart = self._decoded_counterpart
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return counterpart(*decoded_fields)
124
125
class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element

    Subclasses supply two properties:
      _userinfo -> (username, password)
      _hostinfo -> (hostname, port)
    where missing pieces are None.
    """
    __slots__ = ()

    @property
    def username(self):
        """User name from the netloc, or None when there is no userinfo."""
        return self._userinfo[0]

    @property
    def password(self):
        """Password from the netloc, or None when none was given."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when the host is missing/empty."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        return hostname.lower()

    @property
    def port(self):
        """Port number as an int, or None when absent or empty.

        An empty port ("host:") is valid per RFC 3986 (port = *DIGIT) and
        is reported as None instead of failing in int().

        Raises ValueError when a non-empty port is not a base-10 integer.
        """
        port = self._hostinfo[1]
        if not port:
            return None
        return int(port, 10)
153
154
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """Netloc splitting for str results (reads self.netloc)."""
    __slots__ = ()

    @property
    def _userinfo(self):
        # Everything before the LAST '@' is userinfo.
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        host_port = self.netloc.rpartition('@')[2]
        _, open_bracket, after_bracket = host_port.partition('[')
        if open_bracket:
            # Bracketed literal: the port, if any, follows the ']'.
            host, _, trailer = after_bracket.partition(']')
            _, colon, port_text = trailer.partition(':')
        else:
            host, colon, port_text = host_port.partition(':')
        return host, (port_text if colon else None)
183
184
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """Netloc splitting for bytes results (reads self.netloc)."""
    __slots__ = ()

    @property
    def _userinfo(self):
        # Everything before the LAST b'@' is userinfo.
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(b':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        host_port = self.netloc.rpartition(b'@')[2]
        _, open_bracket, after_bracket = host_port.partition(b'[')
        if open_bracket:
            # Bracketed literal: the port, if any, follows the b']'.
            host, _, trailer = after_bracket.partition(b']')
            _, colon, port_text = trailer.partition(b':')
        else:
            host, colon, port_text = host_port.partition(b':')
        return host, (port_text if colon else None)
213
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000214
215from collections import namedtuple
216
# Field layouts shared by the str and bytes flavours of each result class.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple('SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple('ParseResult', 'scheme netloc path params query fragment')

# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr
225
226# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """(url, fragment) pair of str values, as returned by urldefrag()."""
    __slots__ = ()

    def geturl(self):
        """Return the URL with '#fragment' re-attached when non-empty."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000234
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """Five-field str result built by urlsplit(), plus netloc accessors."""
    __slots__ = ()
    def geturl(self):
        """Recombine the fields into a URL by calling urlunsplit()."""
        return urlunsplit(self)
239
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """Six-field str result built by urlparse(), plus netloc accessors."""
    __slots__ = ()
    def geturl(self):
        """Recombine the fields into a URL by calling urlunparse()."""
        return urlunparse(self)
244
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000245# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """(url, fragment) pair of bytes values; bytes twin of DefragResult."""
    __slots__ = ()

    def geturl(self):
        """Return the URL with b'#fragment' re-attached when non-empty."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
253
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """Five-field bytes result; the encoded counterpart of SplitResult."""
    __slots__ = ()
    def geturl(self):
        """Recombine the fields into a URL by calling urlunsplit()."""
        return urlunsplit(self)
258
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """Six-field bytes result; the encoded counterpart of ParseResult."""
    __slots__ = ()
    def geturl(self):
        """Recombine the fields into a URL by calling urlunparse()."""
        return urlunparse(self)
263
264# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Cross-link each str result class with its bytes twin.

    The links are what the encode()/decode() mixin methods look up as
    _encoded_counterpart / _decoded_counterpart.
    """
    for str_cls, bytes_cls in (
            (DefragResult, DefragResultBytes),
            (SplitResult, SplitResultBytes),
            (ParseResult, ParseResultBytes)):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# Run once at import time, then drop the helper from the namespace.
_fix_result_transcoding()
del _fix_result_transcoding
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000277
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Only schemes listed in uses_params get ";params" split off the path.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return _coerce_result(
        ParseResult(scheme, netloc, path, params, query, fragment))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000293
def _splitparams(url):
    """Split ";params" off the last path segment; return (path, params).

    When the path contains '/', only a ';' at or after the last '/' counts.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semi = url.find(';', last_slash)
        if semi < 0:
            return url, ''
    else:
        semi = url.find(';')
    return url[:semi], url[semi + 1:]
302
def _splitnetloc(url, start=0):
    """Return (netloc, rest): url[start:] cut at the first '/', '?' or '#'.

    When none of the delimiters occurs at or after start, the netloc runs
    to the end of url and rest is empty.
    """
    hits = [pos for pos in (url.find(ch, start) for ch in '/?#') if pos >= 0]
    end = min(hits) if hits else len(url)
    return url[start:end], url[end:]
310
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Raises ValueError("Invalid IPv6 URL") when the netloc contains an
    unbalanced '[' or ']'."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key so that equal-comparing
    # values of different types do not share a cache entry.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return _coerce_result(v)
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path).  Only a non-empty run of
            # ASCII digits counts as a port; int() is not used for the probe
            # because it also accepts signs, surrounding whitespace and
            # non-ASCII digits, which made e.g. "tel:+31641044153" lose its
            # scheme.
            rest = url[i+1:]
            if not rest or any(ch not in '0123456789' for ch in rest):
                scheme, url = url[:i].lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000366
def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    components is a six-item iterable:
    (scheme, netloc, path, params, query, fragment)."""
    scheme, netloc, url, params, query, fragment, _coerce_result = (
        _coerce_args(*components))
    # Fold the params back into the path, then defer to urlunsplit().
    path = "%s;%s" % (url, params) if params else url
    return _coerce_result(urlunsplit((scheme, netloc, path, query, fragment)))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000377
def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL.  The components argument can be any five-item iterable:
    (scheme, netloc, path, query, fragment).
    This may result in a slightly different, but equivalent URL, if the URL
    that was parsed originally had unnecessary delimiters (for example, a ?
    with an empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment, _coerce_result = (
        _coerce_args(*components))
    # A "//" authority prefix is emitted when there is a netloc, or when
    # the scheme is one that uses a netloc and the path does not already
    # begin with "//".
    needs_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if needs_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000396
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If base is empty, url is returned unchanged; if url is empty, base is
    returned unchanged.  allow_fragments is passed through to urlparse()
    for both arguments."""
    if not base:
        return url
    if not url:
        return base
    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base's scheme is the default scheme for the reference.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that does not support relative
    # references, means url is returned as given.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own netloc: nothing of the base
            # beyond the scheme is used.
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: keep it, with the base's netloc.
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))
    if not path and not params:
        # Reference has neither path nor params: reuse the base's, and
        # the base's query too unless the reference supplies one.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))
    # Relative path: replace the last segment of the base path with the
    # segments of the reference path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' becomes an empty segment (keeps the trailing '/').
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly remove the first interior "<name>/.." pair until none is
    # left; the first and last segments are never examined as '..' here.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # Handle a '..' left in the final position.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return _coerce_result(urlunparse((scheme, netloc, '/'.join(segments),
                                      params, query, fragment)))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000449
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return _coerce_result(DefragResult(stripped, fragment))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000465
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    string may be a str or a bytes-like object.  A '%' that is not
    followed by exactly two hexadecimal digits is left in the output
    unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    chunks = string.split(b'%')
    if len(chunks) == 1:
        return string
    hexdigits = b'0123456789ABCDEFabcdef'
    out = [chunks[0]]
    for item in chunks[1:]:
        digits = item[:2]
        # Check the two digits explicitly: int(x, 16) alone would also
        # accept a single digit, a sign, or surrounding whitespace.
        if (len(digits) == 2 and digits[0] in hexdigits
                and digits[1] in hexdigits):
            out.append(bytes([int(digits, 16)]))
            out.append(item[2:])
        else:
            out.append(b'%')
            out.append(item)
    return b''.join(out)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000486
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is left
    in the output unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if string == '':
        return string
    segments = string.split('%')
    if len(segments) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    out = [segments[0]]
    # pending: byte values of a contiguous run of percent-encoded bytes,
    # not yet decoded (a multi-byte character may span several escapes).
    pending = []
    for item in segments[1:]:
        digits = item[:2]
        # Check the two digits explicitly: bytes.fromhex() skips
        # whitespace, so it would silently swallow an escape like "%  ".
        if (len(digits) == 2 and digits[0] in hexdigits
                and digits[1] in hexdigits):
            pending.append(int(digits, 16))
            rest = item[2:]
            if not rest:
                # This segment was just a single percent-encoded byte.
                # It may be part of a longer sequence, so delay decoding.
                continue
        else:
            rest = '%' + item
        # Encountered non-percent-encoded characters. Flush the pending
        # byte sequence.
        out.append(bytes(pending).decode(encoding, errors))
        out.append(rest)
        pending = []
    if pending:
        # Flush the final byte sequence
        out.append(bytes(pending).decode(encoding, errors))
    return ''.join(out)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000530
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        Returns a dict mapping each field name to the list of its values,
        in order of appearance.
    """
    parsed = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors):
        parsed.setdefault(name, []).append(value)
    return parsed
562
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    Returns a list of (name, value) pairs.
    """
    qs, _coerce_result = _coerce_args(qs)
    result = []
    # Fields are separated by '&' or ';'.
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not field and not strict_parsing:
                continue
            raw_name, equals, raw_value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control-name with no equal sign is kept (with an
                # empty value) only when blanks are retained.
                if not keep_blank_values:
                    continue
            if not (raw_value or keep_blank_values):
                continue
            name = unquote(raw_name.replace('+', ' '),
                           encoding=encoding, errors=errors)
            value = unquote(raw_value.replace('+', ' '),
                            encoding=encoding, errors=errors)
            result.append((_coerce_result(name), _coerce_result(value)))
    return result
610
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # Plus signs are translated first, so an escaped "%2B" survives as '+'.
    return unquote(string.replace('+', ' '), encoding, errors)
619
# Byte values that are never percent-encoded, whatever "safe" set the
# caller passes: ASCII letters, digits, and '_', '.', '-'.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
# The same set as a bytes object, for use as an rstrip() argument in
# quote_from_bytes().
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache used by quote_from_bytes(): normalized "safe" bytes -> the
# __getitem__ of a Quoter built for it.  Emptied by clear_cache().
_safe_quoters = {}
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000626
class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # The defaultdict itself is the cache: once a byte value has been
    # looked up, later lookups of it don't call Python code at all.
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
648
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects.  encoding and
    errors must not be specified if string is not a str (a TypeError is
    raised otherwise).

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Bytes input is quoted as-is; codec arguments make no sense here.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if not string:
        return string
    encoded = string.encode('utf-8' if encoding is None else encoding,
                            'strict' if errors is None else errors)
    return quote_from_bytes(encoded, safe)
692
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # string may be a str or bytes.  Without any space in it, the regular
    # quote() already produces the right answer.
    has_no_space = ((isinstance(string, str) and ' ' not in string) or
                    (isinstance(string, bytes) and b' ' not in string))
    if has_no_space:
        return quote(string, safe, encoding, errors)
    # Let spaces through quote() untouched (the space literal must match
    # the type of safe), then turn them into plus signs.
    space = ' ' if isinstance(safe, str) else b' '
    return quote(string, safe + space, encoding, errors).replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000709
def quote_from_bytes(bs, safe='/'):
    """Percent-quote a bytes object and return the result as an ASCII str.

    Behaves like quote(), except that the input is already bytes, so no
    string-to-bytes encoding is performed.
    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'

    Raises TypeError if bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''

    # Normalize 'safe' to a bytes object holding ASCII values only.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes([code for code in safe if code < 128])

    # Fast path: if stripping every safe byte leaves nothing, there is
    # nothing to escape and the input can be decoded as-is.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()

    # One quoting callable is cached per distinct 'safe' value.
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        quoter = Quoter(safe).__getitem__
        _safe_quoters[safe] = quoter
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000731
def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a mapping or a sequence of two-element tuples as a query string.

    If doseq is true, any value that is itself a sequence (other than str
    or bytes) contributes one key=value parameter per element.

    When query is a sequence of two-element tuples, the parameters appear
    in the output in the same order as in the input.

    Keys and values may be str or bytes.  bytes are passed to quote_plus()
    with only safe; everything else is passed (after str() conversion
    where needed) together with safe, encoding and errors.

    Raises TypeError if query is neither a mapping nor a non-string
    sequence of tuples.
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # Strings and string-like objects are sequences too, so they have
        # to be rejected explicitly.
        try:
            # len() fails for non-sequences, and the first element of a
            # non-empty string is not a tuple.  Empty sequences of any
            # type pass, which keeps the historical acceptance of empty
            # dicts consistent.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    def _quote(item):
        # bytes are quoted directly; any other object goes through str().
        if isinstance(item, bytes):
            return quote_plus(item, safe)
        return quote_plus(str(item), safe, encoding, errors)

    flat = not doseq
    pairs = []
    for key, value in query:
        key = _quote(key)
        if flat:
            value = _quote(value)
            pairs.append(key + '=' + value)
        elif isinstance(value, bytes):
            value = quote_plus(value, safe)
            pairs.append(key + '=' + value)
        elif isinstance(value, str):
            value = quote_plus(value, safe, encoding, errors)
            pairs.append(key + '=' + value)
        else:
            # Anything that supports len() is treated as a sequence.
            try:
                len(value)
            except TypeError:
                # Not a sequence: encode it as a single value.
                value = quote_plus(str(value), safe, encoding, errors)
                pairs.append(key + '=' + value)
            else:
                # One parameter per element of the sequence.
                for element in value:
                    pairs.append(key + '=' + _quote(element))
    return '&'.join(pairs)
809
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') --> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') -->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') --> 'abc def'
# quote('abc def') --> 'abc%20def'
824
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str argument is verified to be pure ASCII and returned as a str;
    UnicodeError is raised if it contains non-ASCII characters.  Any
    other object is returned unchanged.
    """
    # Most URL schemes require ASCII.  If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
837
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; an enclosing '<...>' pair and a leading 'URL:' prefix are
    then removed, stripping whitespace again after each removal.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
845
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The scheme is returned in lower case.  If url does not start with a
    non-empty run of characters other than '/' and ':' followed by ':',
    the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # Compiled on first use and cached in the module-level variable.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    scheme = found.group(1)
    # The match covers the scheme plus the ':' separator, so everything
    # after its end is the opaque remainder.
    return scheme.lower(), url[found.end():]
859
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '?' or '#'.  Whatever follows is
    returned as the path; a non-empty remainder that does not start with
    '/' (for example '?query' or '#fragment') gets a '/' prepended.  If
    url does not start with '//', the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        import re
        # '#' must end the host just like '/' and '?'; otherwise a URL
        # such as '//127.0.0.1#@evil.com/' reports the wrong host.
        # DOTALL keeps a newline in the remainder from defeating the match.
        _hostprog = re.compile('^//([^/?#]*)(.*)$', re.DOTALL)

    match = _hostprog.match(url)
    if match is None:
        return None, url
    host_port, path = match.group(1, 2)
    if path and not path.startswith('/'):
        path = '/' + path
    return host_port, path
876
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Without an '@' the result is
    (None, host).
    """
    global _userprog
    if _userprog is None:
        import re
        # DOTALL so that user information containing a newline (which
        # splitpasswd() accepts in the password) is still split off.
        _userprog = re.compile('^(.*)@(.*)$', re.DOTALL)

    match = _userprog.match(host)
    if match is None:
        return None, host
    return match.group(1, 2)
888
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; the password part may itself
    contain ':' and newlines.  Without a ':' the result is (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        import re
        # re.S makes '.' match newlines too, so the password may span lines.
        _passwdprog = re.compile('^([^:]*):(.*)$', re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
900
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  An empty port ('host:')
    gives ('host', None): the trailing colon is removed from the host.
    If the text after the last ':' is not all digits, or there is no ':',
    the argument is returned unchanged together with None.
    """
    global _portprog
    if _portprog is None:
        import re
        # '[0-9]*' rather than '[0-9]+' so that a bare trailing ':' is
        # recognised as an (empty) port and not left glued to the host.
        _portprog = re.compile('^(.*):([0-9]*)$')

    match = _portprog.match(host)
    if match:
        host, port = match.group(1, 2)
        if port:
            return host, port
    return host, None
913
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning the port as a number.

    Return the given default port (-1 unless overridden) if no ':' is found.
    Return the numerical port if a valid number is found after the ':'.
    Return None as the port if there is a ':' but no valid number after it.
    """
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport

    hostname, port_text = found.group(1, 2)
    port_number = None
    # An empty port is treated the same way as a non-numeric one.
    if port_text:
        try:
            port_number = int(port_text)
        except ValueError:
            port_number = None
    return hostname, port_number
935
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without a '?' the result is
    (url, None).
    """
    global _queryprog
    if _queryprog is None:
        import re
        # Raw string: '\?' in a plain literal is an invalid escape
        # sequence.  DOTALL keeps a newline in the path from defeating
        # the match.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$', re.DOTALL)

    match = _queryprog.match(url)
    if match is None:
        return url, None
    return match.group(1, 2)
947
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without a '#' the result is
    (url, None).
    """
    global _tagprog
    if _tagprog is None:
        import re
        # DOTALL keeps a newline before the '#' from defeating the match.
        _tagprog = re.compile('^(.*)#([^#]*)$', re.DOTALL)

    match = _tagprog.match(url)
    if match is None:
        return url, None
    return match.group(1, 2)
959
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
965
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='; the value may itself contain '='.
    Without an '=' the result is (attr, None).
    """
    global _valueprog
    if _valueprog is None:
        import re
        # DOTALL keeps a newline in the value from defeating the match.
        _valueprog = re.compile('^([^=]*)=(.*)$', re.DOTALL)

    match = _valueprog.match(attr)
    if match is None:
        return attr, None
    return match.group(1, 2)