# Source blob: 01067ae6ac015fe2d229262601c782930bfd4cfc
"""Parse (absolute and relative) URLs.

The urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to the
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
"""

import collections
import sys

# Names exported by "from <module> import *".
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]

# A classification of schemes ('' means apply by default)

# Schemes for which urljoin() resolves a relative reference against the base.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh']

# Schemes whose URLs carry a '//netloc' part.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

# Schemes for which urlparse() splits ';params' off the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']

# Schemes for which urlsplit() splits off a '?query'.
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']

# Schemes for which urlsplit() splits off a '#fragment'.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# XXX: Consider replacing with functools.lru_cache
# Upper bound on the number of entries kept in _parse_cache.
MAX_CACHE_SIZE = 20
# Cache of split results, keyed by the arguments of the split call.
_parse_cache = {}

def clear_cache():
    """Clear the parse cache and the quoters cache."""
    _parse_cache.clear()
    _safe_quoters.clear()


# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'

def _noop(obj):
    """Return *obj* unchanged (result coercion used for str inputs)."""
    return obj

def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Return obj.encode(encoding, errors).

    Used as the result coercion function when the caller passed
    non-str arguments.
    """
    return obj.encode(encoding, errors)

def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Return a tuple with every item of *args* decoded to str.

    Falsy items (for example b'' or a default '' argument) are
    replaced by '' without being decoded.
    """
    return tuple(x.decode(encoding, errors) if x else '' for x in args)

def _coerce_args(*args):
    """Coerce the arguments to str and pick the matching result converter.

    Returns the (possibly decoded) arguments followed by one extra item:
    the function to apply to the result before returning it to the
    caller.  That function is a no-op for str inputs and an encoding
    function otherwise.

    Raises TypeError when str and non-str arguments are mixed.
    """
    # The first argument decides which kind of input we are dealing with.
    str_input = isinstance(args[0], str)
    for arg in args[1:]:
        # We special-case the empty string to support the
        # "scheme=''" default argument to some functions
        if arg and isinstance(arg, str) != str_input:
            raise TypeError("Cannot mix str and non-str arguments")
    if str_input:
        return args + (_noop,)
    return _decode_args(args) + (_encode_result,)

# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Encode every field and return the bytes counterpart.

        The result is built by calling self._encoded_counterpart with
        the encoded fields as positional arguments.
        """
        return self._encoded_counterpart(
            *(x.encode(encoding, errors) for x in self))


class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Decode every field and return the str counterpart.

        The result is built by calling self._decoded_counterpart with
        the decoded fields as positional arguments.
        """
        return self._decoded_counterpart(
            *(x.decode(encoding, errors) for x in self))


class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element"""
    __slots__ = ()

    # Subclasses provide two properties:
    #   _userinfo -> (username, password)
    #   _hostinfo -> (hostname, port)

    @property
    def username(self):
        """First item of self._userinfo."""
        return self._userinfo[0]

    @property
    def password(self):
        """Second item of self._userinfo."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when it is missing or empty."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        return hostname.lower()

    @property
    def port(self):
        """Port as an int, or None when no port text is present.

        An empty port (a netloc ending in ':') is reported as None
        instead of raising.  Raises ValueError when the port text is not
        a base-10 integer.
        """
        port = self._hostinfo[1]
        if not port:
            return None
        return int(port, 10)


class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """Netloc helpers for results whose fields are str."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) parsed from self.netloc.

        Both are None when the netloc has no '@'; password is None when
        the user info has no ':'.
        """
        netloc = self.netloc
        # Split on the last '@' so the user info may itself contain '@'.
        userinfo, have_info, _ = netloc.rpartition('@')
        if have_info:
            username, have_password, password = userinfo.partition(':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        """Return (hostname, port) parsed from self.netloc.

        port is the raw text after ':' or None when there is no ':'.
        A bracketed host ('[...]') is returned without the brackets.
        """
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition('@')
        _, have_open_br, bracketed = hostinfo.partition('[')
        if have_open_br:
            # The host is everything up to ']'; the port follows the
            # ':' after the closing bracket.
            hostname, _, port = bracketed.partition(']')
            _, have_port, port = port.partition(':')
        else:
            hostname, have_port, port = hostinfo.partition(':')
        if not have_port:
            port = None
        return hostname, port


class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """Netloc helpers for results whose fields are bytes."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) parsed from self.netloc.

        Both are None when the netloc has no b'@'; password is None when
        the user info has no b':'.
        """
        netloc = self.netloc
        # Split on the last b'@' so the user info may itself contain b'@'.
        userinfo, have_info, _ = netloc.rpartition(b'@')
        if have_info:
            username, have_password, password = userinfo.partition(b':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        """Return (hostname, port) parsed from self.netloc.

        port is the raw bytes after b':' or None when there is no b':'.
        A bracketed host (b'[...]') is returned without the brackets.
        """
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition(b'@')
        _, have_open_br, bracketed = hostinfo.partition(b'[')
        if have_open_br:
            # The host is everything up to b']'; the port follows the
            # b':' after the closing bracket.
            hostname, _, port = bracketed.partition(b']')
            _, have_port, port = port.partition(b':')
        else:
            hostname, have_port, port = hostinfo.partition(b':')
        if not have_port:
            port = None
        return hostname, port


from collections import namedtuple

# Plain tuple layouts shared by the str and bytes result classes.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr

# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """(url, fragment) result holding str fields."""
    __slots__ = ()

    def geturl(self):
        """Return the URL with '#fragment' re-attached when non-empty."""
        if self.fragment:
            return self.url + '#' + self.fragment
        else:
            return self.url

class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """(scheme, netloc, path, query, fragment) result holding str fields."""
    __slots__ = ()

    def geturl(self):
        """Return urlunsplit(self)."""
        return urlunsplit(self)

class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """(scheme, netloc, path, params, query, fragment) result holding str fields."""
    __slots__ = ()

    def geturl(self):
        """Return urlunparse(self)."""
        return urlunparse(self)

# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """(url, fragment) result holding bytes fields."""
    __slots__ = ()

    def geturl(self):
        """Return the URL with b'#fragment' re-attached when non-empty."""
        if self.fragment:
            return self.url + b'#' + self.fragment
        else:
            return self.url

class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """(scheme, netloc, path, query, fragment) result holding bytes fields."""
    __slots__ = ()

    def geturl(self):
        """Return urlunsplit(self)."""
        return urlunsplit(self)

class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """(scheme, netloc, path, params, query, fragment) result holding bytes fields."""
    __slots__ = ()

    def geturl(self):
        """Return urlunparse(self)."""
        return urlunparse(self)

# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Link each str result class with its bytes counterpart.

    Sets _encoded_counterpart on the str class and _decoded_counterpart
    on the bytes class; these are the attributes the encode()/decode()
    mixin methods look up.
    """
    _result_pairs = (
        (DefragResult, DefragResultBytes),
        (SplitResult, SplitResultBytes),
        (ParseResult, ParseResultBytes),
    )
    for _decoded, _encoded in _result_pairs:
        _decoded._encoded_counterpart = _encoded
        _encoded._decoded_counterpart = _decoded

# Run once at import time, then drop the helper from the namespace.
_fix_result_transcoding()
del _fix_result_transcoding

def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    The params component is only split off the path when the scheme is
    listed in uses_params; otherwise it is ''.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, url, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    result = ParseResult(scheme, netloc, url, params, query, fragment)
    return _coerce_result(result)

def _splitparams(url):
    """Split *url* into (path, params) at a ';'.

    When the path contains '/', only a ';' located after the last '/'
    is considered, so parameters attached to earlier segments stay in
    the path.  Returns (url, '') when no such ';' exists.
    """
    if '/' in url:
        i = url.find(';', url.rfind('/'))
    else:
        i = url.find(';')
    if i < 0:
        # No parameter delimiter: never slice with a negative index.
        return url, ''
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    """Return (netloc, rest) for *url*, scanning from index *start*.

    The netloc ends at the earliest of '/', '?' or '#' found at or
    after *start*, or at the end of the string when none is present.
    """
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)     # find first of this delim
        if wdelim >= 0:                 # if found
            delim = min(delim, wdelim)  # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    *scheme* is the default scheme used when the URL does not specify
    one.  Raises ValueError("Invalid IPv6 URL") when the netloc contains
    only one of '[' and ']'.  Results are memoized in _parse_cache.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    # True when the URL literally starts with "http:".  That common case
    # always splits query and fragment, without consulting the
    # uses_query / uses_fragment tables.
    common_case = False
    i = url.find(':')
    if i > 0:
        prefix, rest = url[:i], url[i+1:]
        if prefix == 'http':  # optimize the common case
            common_case = True
            scheme, url = prefix, rest
        elif all(c in scheme_chars for c in prefix):
            try:
                # make sure "url" is not actually a port number (in which
                # case "scheme" is really part of the path)
                int(rest)
            except ValueError:
                scheme, url = prefix.lower(), rest
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Exactly one of the two brackets present means a malformed
        # IPv6 literal.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if (allow_fragments and '#' in url
            and (common_case or scheme in uses_fragment)):
        url, fragment = url.split('#', 1)
    if '?' in url and (common_case or scheme in uses_query):
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)

def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    *components* is a six-item iterable:
    (scheme, netloc, path, params, query, fragment).
    """
    scheme, netloc, url, params, query, fragment, _coerce_result = (
        _coerce_args(*components))
    if params:
        # Re-attach the parameters to the path before delegating.
        url = "%s;%s" % (url, params)
    return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))

def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment, _coerce_result = (
        _coerce_args(*components))
    # Emit '//netloc' when there is a netloc, or when the scheme is one
    # that uses a netloc and the path does not already start with '//'.
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)

def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    Returns *url* unchanged when *base* is empty, and *base* unchanged
    when *url* is empty.
    """
    if not base:
        return url
    if not url:
        return base
    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that does not support relative
    # references: the URL stands on its own.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from the base.
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))
    if not path and not params:
        # Empty path: reuse the base path and params, and the base query
        # too unless the URL supplies its own.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))
    # Relative path: drop the last base segment and append the new ones.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse the first "<name>/.." pair found among the
    # inner segments until no such pair is left.
    while True:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                    and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i += 1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return _coerce_result(urlunparse((scheme, netloc, '/'.join(segments),
                                      params, query, fragment)))

def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
    else:
        # Nothing to strip; return the URL as given.
        frag = ''
        defrag = url
    return _coerce_result(DefragResult(defrag, frag))

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be str or bytes.  A '%' that is not followed by exactly
    two hexadecimal digits is copied to the output unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    bits = string.split(b'%')
    if len(bits) == 1:
        return string
    hexdigits = b'0123456789ABCDEFabcdef'
    chunks = [bits[0]]
    for item in bits[1:]:
        pair = item[:2]
        # Validate the digits explicitly: int(x, 16) alone would also
        # accept a single digit, a sign or surrounding whitespace.
        if len(pair) == 2 and pair[0] in hexdigits and pair[1] in hexdigits:
            chunks.append(bytes([int(pair, 16)]))
            chunks.append(item[2:])
        else:
            chunks.append(b'%')
            chunks.append(item)
    # Join once at the end rather than growing a bytes object per escape.
    return b''.join(chunks)

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is left
    in the output unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if string == '':
        return string
    bits = string.split('%')
    if len(bits) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    pieces = [bits[0]]
    # pct_sequence: contiguous sequence of percent-encoded bytes, not yet
    # decoded (a multi-byte character may span several escapes).
    pct_sequence = bytearray()
    for item in bits[1:]:
        pair = item[:2]
        # Validate the digits explicitly: bytes.fromhex() skips whitespace,
        # which would make an escape such as '%  ' silently disappear.
        if len(pair) == 2 and pair[0] in hexdigits and pair[1] in hexdigits:
            pct_sequence.append(int(pair, 16))
            rest = item[2:]
            if not rest:
                # This segment was just a single percent-encoded character.
                # May be part of a sequence of code units, so delay decoding.
                # (Stored in pct_sequence).
                continue
        else:
            rest = '%' + item
        # Encountered non-percent-encoded characters. Flush the current
        # pct_sequence.
        pieces.append(bytes(pct_sequence).decode(encoding, errors))
        pieces.append(rest)
        pct_sequence = bytearray()
    if pct_sequence:
        # Flush the final pct_sequence
        pieces.append(bytes(pct_sequence).decode(encoding, errors))
    return ''.join(pieces)

def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        Returns a dict mapping each name to the list of its values, in
        order of appearance.
    """
    parsed_result = {}
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                      encoding=encoding, errors=errors)
    for name, value in pairs:
        # Repeated names accumulate their values in one list.
        parsed_result.setdefault(name, []).append(value)
    return parsed_result

def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    Returns a list of (name, value) pairs.
    """
    qs, _coerce_result = _coerce_args(qs)
    # NOTE(review): both '&' and ';' act as field separators here.  Later
    # CPython releases made the separator configurable because treating
    # ';' as one can disagree with how other software splits the same
    # query string; kept as-is to preserve behaviour.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            # '+' stands for a space in form data; replace it before
            # unquoting so that an escaped '%2B' survives as a plus.
            name = nv[0].replace('+', ' ')
            name = unquote(name, encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = nv[1].replace('+', ' ')
            value = unquote(value, encoding=encoding, errors=errors)
            value = _coerce_result(value)
            r.append((name, value))
    return r

def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # Replace '+' first so that an escaped '%2B' still yields a plus sign.
    string = string.replace('+', ' ')
    return unquote(string, encoding, errors)

# Byte values that are never percent-encoded.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
# The same values as a bytes object (usable as an rstrip() argument).
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache of quoting callables, keyed by the normalized "safe" bytes.
_safe_quoters = {}

class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Handle a cache miss. Store quoted string in cache and return.
        res = chr(b) if b in self.safe else '%{:02X}'.format(b)
        self[b] = res
        return res

def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects. encoding and
    errors must not be specified if string is a bytes object
    (TypeError is raised otherwise).

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if isinstance(string, str):
        if not string:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        string = string.encode(encoding, errors)
    else:
        # Bytes input is quoted as-is, so text encoding options make no
        # sense for it.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    return quote_from_bytes(string, safe)

def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if ((isinstance(string, str) and ' ' not in string) or
            (isinstance(string, bytes) and b' ' not in string)):
        return quote(string, safe, encoding, errors)
    # Keep spaces unescaped while quoting (the space must have the same
    # type as safe), then turn them into '+'.
    if isinstance(safe, str):
        space = ' '
    else:
        space = b' '
    string = quote(string, safe + space, encoding, errors)
    return string.replace(' ', '+')

def quote_from_bytes(bs, safe='/'):
    """Percent-quote a bytes (or bytearray) object and return a str.

    Works like quote() but performs no string-to-bytes encoding step.
    safe may be a str or a bytes-like sequence; non-ASCII entries in it
    are dropped.
    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'

    Raises TypeError when bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''

    # Normalize 'safe' to a bytes object holding ASCII characters only.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)

    # Shortcut: if stripping safe bytes from the right leaves nothing, the
    # whole input consists of safe bytes and needs no quoting.
    leftover = bs.rstrip(_ALWAYS_SAFE_BYTES + safe)
    if not leftover:
        return bs.decode()

    # Reuse the quoting callable cached for this 'safe' set, creating and
    # caching it on first use.
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        quoter = Quoter(safe).__getitem__
        _safe_quoters[safe] = quoter
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000732
def _quote_query_part(part, safe, encoding, errors):
    """Return one query key or value quoted with quote_plus()."""
    if isinstance(part, bytes):
        # encoding/errors are deliberately not forwarded for bytes input.
        return quote_plus(part, safe)
    return quote_plus(str(part), safe, encoding, errors)


def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of the query arg (keys and values) may each be either a
    string or a bytes type.  Components that are not bytes are converted
    with str() and the safe, encoding and errors parameters are passed on
    to quote_plus() for encoding; bytes components are quoted with safe
    only.

    Raises TypeError if query is neither a mapping nor a non-string
    sequence of tuples.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            # Re-raise with a clearer message while keeping the traceback
            # of the original failure.
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(err.__traceback__)

    pairs = []
    if not doseq:
        for k, v in query:
            k = _quote_query_part(k, safe, encoding, errors)
            v = _quote_query_part(v, safe, encoding, errors)
            pairs.append(k + '=' + v)
        return '&'.join(pairs)

    for k, v in query:
        k = _quote_query_part(k, safe, encoding, errors)
        if isinstance(v, str):
            # A str value is a single parameter, not a sequence of chars.
            pairs.append(k + '=' + quote_plus(v, safe, encoding, errors))
            continue
        if isinstance(v, bytes):
            pairs.append(k + '=' + _quote_query_part(v, safe, encoding, errors))
            continue
        try:
            # Is this a sufficient test for sequence-ness?
            len(v)
        except TypeError:
            # not a sequence
            pairs.append(k + '=' + _quote_query_part(v, safe, encoding, errors))
        else:
            # one key=value parameter per element of the sequence
            for elt in v:
                pairs.append(
                    k + '=' + _quote_query_part(elt, safe, encoding, errors))
    return '&'.join(pairs)
810
811# Utilities to parse URLs (most of these return None for missing parts):
812# unwrap('<URL:type://host/path>') --> 'type://host/path'
813# splittype('type:opaquestring') --> 'type', 'opaquestring'
814# splithost('//host[:port]/path') --> 'host[:port]', '/path'
815# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
816# splitpasswd('user:passwd') -> 'user', 'passwd'
817# splitport('host:port') --> 'host', 'port'
818# splitquery('/path?query') --> '/path', 'query'
819# splittag('/path#tag') --> '/path', 'tag'
820# splitattr('/path;attr1=value1;attr2=value2;...') ->
821# '/path', ['attr1=value1', 'attr2=value2', ...]
822# splitvalue('attr=value') --> 'attr', 'value'
823# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
825
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str argument is checked to be pure ASCII and returned as a str;
    UnicodeError is raised when it is not.  Any other object is returned
    unchanged.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        # Round-trip through ASCII purely to validate the characters.
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
838
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; an enclosing '<' ... '>' pair and a leading 'URL:' prefix
    are then removed, stripping whitespace again after each removal.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
846
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The scheme is returned lower-cased.  When url has no leading
    'scheme:' part the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # Compile lazily on first use and cache in the module global.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    scheme = found.group(1)
    # Everything after "scheme:" is the opaque remainder.
    remainder = url[found.end():]
    return scheme.lower(), remainder
860
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '?' or '#'.  A remainder that
    does not already start with '/' (a bare query or fragment) gets a
    leading '/' prepended.  When url does not start with '//' the result
    is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        # Compile lazily on first use and cache in the module global.
        import re
        # '#' must terminate the authority just like '/' and '?', otherwise
        # '//host#@other/' would report 'host#@other' as the host.  DOTALL
        # lets the remainder contain newlines instead of failing the match.
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    match = _hostprog.match(url)
    if match:
        host_port, path = match.groups()
        if path and path[0] != '/':
            path = '/' + path
        return host_port, path
    return None, url
877
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Without any '@' the result is
    (None, host).
    """
    global _userprog
    if _userprog is None:
        # Compile lazily on first use and cache in the module global.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    found = _userprog.match(host)
    if found is None:
        return None, host
    return found.groups()
889
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without any ':' the result is
    (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        # Compile lazily on first use and cache in the module global.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$',re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.groups()
901
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  When host does not end
    in ':' followed by at least one digit the result is (host, None).
    """
    global _portprog
    if _portprog is None:
        # Compile lazily on first use and cache in the module global.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.groups()
914
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning a numeric port.

    Return (host, defport) if no ':' is found; defport defaults to -1.
    Otherwise the text after the last ':' is converted with int():
    the port is that integer when the conversion succeeds, and None when
    the text is empty or not a valid number."""
    global _nportprog
    if _nportprog is None:
        # Compile lazily on first use and cache in the module global.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    hostname, port_text = found.groups()
    try:
        if port_text == '':
            raise ValueError("no digits")
        number = int(port_text)
    except ValueError:
        number = None
    return hostname, number
936
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without any '?' the result is
    (url, None).
    """
    global _queryprog
    if _queryprog is None:
        # Compile lazily on first use and cache in the module global.
        import re
        # Raw string: '\?' in a plain literal is an invalid escape sequence.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None
948
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without any '#' the result is
    (url, None).
    """
    global _tagprog
    if _tagprog is None:
        # Compile lazily on first use and cache in the module global.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.groups()
960
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'."""
    path, *attrs = url.split(';')
    return path, attrs
966
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without any '=' the result is
    (attr, None).
    """
    global _valueprog
    if _valueprog is None:
        # Compile lazily on first use and cache in the module global.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.groups()