blob: ab5b356919a39001f26aed9e38f90d5ac055f169 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaranfd41e082010-04-17 14:44:14 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000011RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000012Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000014RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000015
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000019RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000020McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000028"""
29
Facundo Batista2ac5de22008-07-07 18:24:11 +000030import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000031import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000032
Jeremy Hylton1afc1692008-06-18 20:49:58 +000033__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
Senthil Kumaran0256b2a2010-10-25 16:36:20 +000034 "urlsplit", "urlunsplit", "urlencode", "parse_qs",
35 "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
Guido van Rossum52dbbb92008-08-18 21:44:30 +000036 "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000037
# A classification of schemes ('' means apply by default).
# Each list names the schemes for which the corresponding URL feature
# is recognised by the functions in this module.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '', 'sftp',
]
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu',
    'rsync', '', 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
]
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap',
    'snews', 'sip', 'sips',
]
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms', '', 'sftp',
]
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms', 'gopher', 'rtsp',
    'rtspu', 'sip', 'sips', '',
]
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', 'https',
    'shttp', 'snews', 'file', 'prospero', '',
]

# Characters valid in scheme names: letters, digits, then '+', '-', '.'
scheme_chars = ('abcdefghijklmnopqrstuvwxyz' +
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
                '0123456789' +
                '+-.')
62
# XXX: Consider replacing with functools.lru_cache
# Upper bound on the number of memoized urlsplit() results; urlsplit()
# empties the caches once the parse cache reaches this many entries.
MAX_CACHE_SIZE = 20
_parse_cache = dict()
66
def clear_cache():
    """Empty the module-level caches: parsed URLs and cached quoters."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000071
72
Nick Coghlan9fc443c2010-11-30 15:48:08 +000073# Helpers for bytes handling
74# For 3.2, we deliberately require applications that
75# handle improperly quoted URLs to do their own
76# decoding and encoding. If valid use cases are
77# presented, we may relax this by using latin-1
78# decoding internally for 3.3
# Codec and error policy used as the defaults when bytes arguments are
# implicitly decoded to str and str results are encoded back to bytes.
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
81
def _noop(obj):
    """Identity coercion: hand *obj* back untouched (used for str inputs)."""
    return obj
84
def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Result coercion for bytes callers: return ``obj.encode(...)``.

    *obj* is anything exposing ``encode(encoding, errors)`` -- a str, or
    one of the str-based result tuples defined in this module.
    """
    encoded = obj.encode(encoding, errors)
    return encoded
88
def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Decode every item of *args* and return the results as a tuple.

    Falsy items (empty bytes, '' or None) are not decoded; they become ''.
    """
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)
92
def _coerce_args(*args):
    """Normalise the arguments to str and pick the matching result coercion.

    Returns the (possibly decoded) arguments followed by one extra item:
      - _noop when the first argument is a str (arguments returned as-is)
      - _encode_result otherwise (arguments decoded via _decode_args)

    Raises TypeError when a later, non-empty argument disagrees with the
    first argument about being a str.
    """
    expects_str = isinstance(args[0], str)
    # Falsy arguments are exempt from the check: this supports the
    # "scheme=''" default argument used by some functions.
    mixed = any(arg and isinstance(arg, str) != expects_str
                for arg in args[1:])
    if mixed:
        raise TypeError("Cannot mix str and non-str arguments")
    if not expects_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
108
109# Result objects are more helpful than simple tuples
# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Mixin adding encode() to a str-based result tuple.

    The concrete class must provide ``_encoded_counterpart``, the callable
    used to build the bytes version from the encoded fields.
    """
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Encode every field and return the bytes counterpart object."""
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return self._encoded_counterpart(*encoded_fields)
116
117
class _ResultMixinBytes(object):
    """Mixin adding decode() to a bytes-based result tuple.

    The concrete class must provide ``_decoded_counterpart``, the callable
    used to build the str version from the decoded fields.
    """
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Decode every field and return the str counterpart object."""
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return self._decoded_counterpart(*decoded_fields)
124
125
class _NetlocResultMixinBase(object):
    """Shared accessors for parsed results that contain a netloc element.

    Subclasses supply two properties:
      _userinfo -> (username, password)
      _hostinfo -> (hostname, port)
    with items of the same string type (str or bytes) as the netloc.
    """
    __slots__ = ()

    @property
    def username(self):
        """First item of ``_userinfo``."""
        return self._userinfo[0]

    @property
    def password(self):
        """Second item of ``_userinfo``."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when the host is missing/empty."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        return hostname.lower()

    @property
    def port(self):
        """Port as an int, or None when no port is given.

        An empty port ("host:" with nothing after the colon) is legal per
        RFC 3986 (port = *DIGIT) and is reported as None rather than
        letting int() fail on an empty string.

        Raises ValueError when the port text is not a base-10 integer.
        """
        port = self._hostinfo[1]
        if not port:
            # Covers both "no ':' delimiter" (None) and "':' with no digits".
            return None
        return int(port, 10)
153
154
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """str implementation of the netloc-splitting helpers."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last '@'.

        Both are None without an '@'; password is None without a ':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        username, colon, password = credentials.partition(':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last '@'.

        A host in '[...]' is returned without the brackets.  port is None
        when no ':' port delimiter follows the host.
        """
        host_and_port = self.netloc.rpartition('@')[2]
        _, open_bracket, bracketed = host_and_port.partition('[')
        if open_bracket:
            # Bracketed host: the port delimiter is looked for after ']'.
            hostname, _, trailer = bracketed.partition(']')
            _, colon, port = trailer.partition(':')
        else:
            hostname, colon, port = host_and_port.partition(':')
        return hostname, (port if colon else None)
183
184
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """bytes implementation of the netloc-splitting helpers."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last b'@'.

        Both are None without a b'@'; password is None without a b':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        username, colon, password = credentials.partition(b':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last b'@'.

        A host in b'[...]' is returned without the brackets.  port is
        None when no b':' port delimiter follows the host.
        """
        host_and_port = self.netloc.rpartition(b'@')[2]
        _, open_bracket, bracketed = host_and_port.partition(b'[')
        if open_bracket:
            # Bracketed host: the port delimiter is looked for after b']'.
            hostname, _, trailer = bracketed.partition(b']')
            _, colon, port = trailer.partition(b':')
        else:
            hostname, colon, port = host_and_port.partition(b':')
        return hostname, (port if colon else None)
213
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000214
215from collections import namedtuple
216
# Plain tuple layouts shared by the str and the bytes result classes.
_DefragResultBase = namedtuple('DefragResult', ['url', 'fragment'])
_SplitResultBase = namedtuple(
    'SplitResult', ['scheme', 'netloc', 'path', 'query', 'fragment'])
_ParseResultBase = namedtuple(
    'ParseResult',
    ['scheme', 'netloc', 'path', 'params', 'query', 'fragment'])
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000220
# For backwards compatibility, alias _NetlocResultMixinStr as ResultBase.
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle.
224ResultBase = _NetlocResultMixinStr
225
226# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """(url, fragment) result for str data."""
    __slots__ = ()

    def geturl(self):
        """Return url, with '#' + fragment appended when fragment is set."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000234
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """(scheme, netloc, path, query, fragment) result for str data."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the URL by passing this tuple to urlunsplit()."""
        reassembled = urlunsplit(self)
        return reassembled
239
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """(scheme, netloc, path, params, query, fragment) result for str data."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the URL by passing this tuple to urlunparse()."""
        reassembled = urlunparse(self)
        return reassembled
244
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000245# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """(url, fragment) result for bytes data."""
    __slots__ = ()

    def geturl(self):
        """Return url, with b'#' + fragment appended when fragment is set."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
253
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """(scheme, netloc, path, query, fragment) result for bytes data."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the URL by passing this tuple to urlunsplit()."""
        reassembled = urlunsplit(self)
        return reassembled
258
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """(scheme, netloc, path, params, query, fragment) result for bytes data."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the URL by passing this tuple to urlunparse()."""
        reassembled = urlunparse(self)
        return reassembled
263
264# Set up the encode/decode result pairs
# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Cross-link every str result class with its bytes twin.

    Sets ``_encoded_counterpart`` on the str class and
    ``_decoded_counterpart`` on the bytes class; these are the attributes
    the encode()/decode() mixin methods call.
    """
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000277
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    # Parameters are only split off for schemes listed in uses_params.
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000293
def _splitparams(url):
    """Split ';params' off the last path segment of *url*.

    Returns (path, params).  Only a ';' located after the final '/' is
    considered (or anywhere, when there is no '/').  If no such ';'
    exists, returns (url, '') -- including for a slash-less string,
    which previously produced a truncated path.
    """
    # rfind() gives -1 without a '/', so the search starts at index 0.
    i = url.find(';', url.rfind('/') + 1)
    if i < 0:
        return url, ''
    return url[:i], url[i+1:]
302
def _splitnetloc(url, start=0):
    """Return (netloc, rest) where netloc is url[start:] up to the first
    '/', '?' or '#' found at or after *start*, and rest begins at that
    delimiter ('' when there is none)."""
    hits = [url.find(delimiter, start) for delimiter in '/?#']
    # Keep only real hits; the end of the string is the fallback.
    candidates = [pos for pos in hits if pos >= 0] + [len(url)]
    end = min(candidates)
    return url[start:end], url[end:]
310
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    *scheme* is the default used when the URL does not carry one.
    Results are memoized in _parse_cache.  Raises ValueError("Invalid
    IPv6 URL") when the netloc has an unmatched '[' or ']'.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return _coerce_result(v)
        # "name:1234" is ambiguous (scheme-less host:port vs. scheme:path).
        # Treat the text after ':' as a port only when it consists solely
        # of digits; checking just its first character dropped the scheme
        # of URLs such as "tel:123-4567" or "sip:1234@host".
        rest = url[i+1:]
        if not rest or any(c not in '0123456789' for c in rest):
            if all(c in scheme_chars for c in url[:i]):
                scheme, url = url[:i].lower(), rest
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000361
def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    *components* is a six-item iterable:
    (scheme, netloc, path, params, query, fragment)."""
    *parts, _coerce_result = _coerce_args(*components)
    scheme, netloc, url, params, query, fragment = parts
    if params:
        # Fold the params back into the path before delegating.
        url = "%s;%s" % (url, params)
    joined = urlunsplit((scheme, netloc, url, query, fragment))
    return _coerce_result(joined)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000372
def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The components argument can be any five-item
    iterable: (scheme, netloc, path, query, fragment).
    This may result in a slightly different, but equivalent URL, if the URL
    that was parsed originally had unnecessary delimiters (for example, a ?
    with an empty query; the RFC states that these are equivalent)."""
    *parts, _coerce_result = _coerce_args(*components)
    scheme, netloc, url, query, fragment = parts
    # Emit '//' when there is a netloc, or when the scheme is a netloc
    # scheme and the path does not already start with '//'.
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url += '?' + query
    if fragment:
        url += '#' + fragment
    return _coerce_result(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000391
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty *base* returns *url* unchanged and an empty *url* returns
    *base* unchanged.  *url* is also returned as-is when its scheme
    differs from the base's or is not listed in uses_relative."""
    if not base:
        return url
    if not url:
        return base
    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    def _assemble(final_path):
        # Uses scheme/netloc/params/query/fragment as they stand when called.
        return _coerce_result(urlunparse((scheme, netloc, final_path,
                                          params, query, fragment)))

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _assemble(path)
        netloc = bnetloc
    if path[:1] == '/':
        return _assemble(path)
    if not path:
        if params:
            # Only params given: existing behaviour drops the last
            # character of the base path and keeps the URL's own query.
            return _assemble(bpath[:-1])
        params = bparams
        if not query:
            query = bquery
        return _assemble(bpath)
    # Replace the last base segment with the relative path's segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [segment for segment in segments if segment != '.']
    # Repeatedly remove the first interior "<name>/.." pair until none
    # is left; the first and the last segment are never the '..' examined.
    while True:
        for i in range(1, len(segments) - 1):
            if (segments[i] == '..'
                    and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return _assemble('/'.join(segments))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000449
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        # Nothing to strip: hand the URL back unparsed.
        defrag, frag = url, ''
    else:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
    return _coerce_result(DefragResult(defrag, frag))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000465
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes-like object.  A '%' that is not
    followed by exactly two hexadecimal digits is left untouched.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    res = string.split(b'%')
    if len(res) == 1:
        return string
    hexdigits = b'0123456789ABCDEFabcdef'
    pieces = [res[0]]
    for item in res[1:]:
        code = item[:2]
        # int(code, 16) alone is too lenient: it also accepts a single
        # digit, a sign or surrounding whitespace, so validate explicitly.
        if (len(code) == 2 and code[0] in hexdigits
                and code[1] in hexdigits):
            pieces.append(bytes([int(code, 16)]))
            pieces.append(item[2:])
        else:
            pieces.append(b'%')
            pieces.append(item)
    return b''.join(pieces)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000486
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is left
    untouched.

    unquote('abc%20def') -> 'abc def'.
    """
    if string == '':
        return string
    res = string.split('%')
    if len(res) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
    pct_sequence = b''
    pieces = [res[0]]
    for item in res[1:]:
        code = item[:2]
        # Validate explicitly: bytes.fromhex() ignores whitespace, so
        # relying on it would turn "%  " into nothing at all.
        if (len(code) == 2 and code[0] in hexdigits
                and code[1] in hexdigits):
            pct_sequence += bytes([int(code, 16)])
            rest = item[2:]
            if not rest:
                # This segment was just a single percent-encoded character.
                # May be part of a sequence of code units, so delay decoding.
                # (Stored in pct_sequence).
                continue
        else:
            rest = '%' + item
        # Encountered non-percent-encoded characters. Flush the current
        # pct_sequence.
        pieces.append(pct_sequence.decode(encoding, errors))
        pieces.append(rest)
        pct_sequence = b''
    if pct_sequence:
        # Flush the final pct_sequence
        pieces.append(pct_sequence.decode(encoding, errors))
    return ''.join(pieces)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000530
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        Returns a dict mapping each name to the list of its values, in
        the order parse_qsl() yields them.
    """
    parsed = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        parsed.setdefault(name, []).append(value)
    return parsed
556
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) pairs.
    """
    qs, _coerce_result = _coerce_args(qs)
    # Fields are separated by either '&' or ';'.
    fields = []
    for chunk in qs.split('&'):
        fields.extend(chunk.split(';'))
    result = []
    for name_value in fields:
        if not name_value and not strict_parsing:
            continue
        raw_name, equals, raw_value = name_value.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if not keep_blank_values:
                continue
        if raw_value or keep_blank_values:
            name = _coerce_result(unquote(raw_name.replace('+', ' ')))
            value = _coerce_result(unquote(raw_value.replace('+', ' ')))
            result.append((name, value))
    return result
596
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # Plus signs are converted before unquoting, so an escaped '%2B'
    # still comes out as a literal '+'.
    return unquote(string.replace('+', ' '), encoding, errors)
605
# Byte values that are never percent-encoded: ASCII letters, digits
# and '_', '.', '-'.
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    b'abcdefghijklmnopqrstuvwxyz'
    b'0123456789'
    b'_.-'
)
# The same set as a bytes object.
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache of quoter callables, keyed by the normalised 'safe' bytes.
_safe_quoters = dict()
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000612
class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Handle a cache miss. Store quoted string in cache and return.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%' + format(b, '02X')
        self[b] = quoted
        return quoted
634
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects. encoding and
    errors must not be specified if string is a bytes object (a
    TypeError is raised).

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Already bytes-like: there is nothing to encode, so reject
        # encoding options.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    elif not string:
        return string
    else:
        string = string.encode(
            'utf-8' if encoding is None else encoding,
            'strict' if errors is None else errors)
    return quote_from_bytes(string, safe)
678
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if isinstance(string, str):
        may_have_space = ' ' in string
    elif isinstance(string, bytes):
        may_have_space = b' ' in string
    else:
        # Any other type takes the space-handling path below.
        may_have_space = True
    if not may_have_space:
        return quote(string, safe, encoding, errors)
    # Mark the space as safe so it survives quote(), then swap it for '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000695
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\\xab') -> 'abc%20def%AB'

    Raises TypeError when bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)
    # Fast path: nothing left after stripping safe bytes means no quoting.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000717
def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values may be bytes or any other object.  bytes are handed to
    quote_plus() together with 'safe'; everything else is converted with
    str() and handed to quote_plus() together with 'safe', 'encoding' and
    'errors'.

    Raises TypeError if query is neither a mapping (an object with an
    items() method) nor a sequence whose first element is a tuple.
    """

    def quote_item(value):
        # bytes are quoted directly; any other object goes through str().
        if isinstance(value, bytes):
            return quote_plus(value, safe)
        return quote_plus(str(value), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # Non-sequence items do not work with len(), and a non-empty
            # string fails the tuple check on its first element.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types get here and succeed.
            # Since the original implementation allowed empty dicts, that
            # behavior is preserved for consistency.
        except TypeError as err:
            raise TypeError(
                "not a valid non-string sequence "
                "or mapping object").with_traceback(err.__traceback__)

    pairs = []
    for k, v in query:
        k = quote_item(k)
        if not doseq or isinstance(v, bytes):
            pairs.append(k + '=' + quote_item(v))
        elif isinstance(v, str):
            # A str is a sequence too, but it is encoded as one value.
            pairs.append(k + '=' + quote_plus(v, safe, encoding, errors))
        else:
            try:
                # Only objects supporting len() are treated as sequences.
                len(v)
            except TypeError:
                # not a sequence
                pairs.append(k + '=' + quote_item(v))
            else:
                # One "key=value" pair per element of the sequence.
                for elt in v:
                    pairs.append(k + '=' + quote_item(elt))
    return '&'.join(pairs)
795
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
810
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str argument must be pure ASCII and is returned as a str; a
    UnicodeError is raised otherwise.  Any other object is returned
    unchanged.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        # The round trip through ASCII bytes is what performs the check.
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
823
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    text = str(url).strip()
    # Drop one pair of enclosing angle brackets, if present.
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    # Drop a leading 'URL:' marker, if present.
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
831
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case; it is None (and url is returned
    unchanged) when url does not start with a 'type:' prefix.
    """
    global _typeprog
    if _typeprog is None:
        # Compile the pattern lazily, on first use.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # The match starts at offset 0 and ends right after the ':'.
    remainder = url[found.end():]
    return found.group(1).lower(), remainder
845
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '?' or '#'.  A non-empty remainder
    that does not start with '/' gets one prepended.  Returns (None, url)
    when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # Compile the pattern lazily, on first use.
        import re
        # '#' must end the host as well: otherwise '//host#@other' would
        # report 'host#@other' as the host.  DOTALL keeps URLs containing
        # newlines intact instead of failing to match or dropping a
        # trailing newline.
        _hostprog = re.compile('^//([^/#?]*)(.*)$', re.DOTALL)

    match = _hostprog.match(url)
    if match:
        host_port, path = match.groups()
        if path and not path.startswith('/'):
            path = '/' + path
        return host_port, path
    return None, url
862
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Returns (None, host) when host
    contains no '@'.
    """
    global _userprog
    if _userprog is None:
        # Compile the pattern lazily, on first use.
        import re
        # DOTALL: credentials may contain newlines (splitpasswd() already
        # allows them), and they must not defeat the split.
        _userprog = re.compile('^(.*)@(.*)$', re.DOTALL)

    match = _userprog.match(host)
    if match:
        return match.group(1, 2)
    return None, host
874
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Returns (user, None) when there
    is no ':' at all.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compile the pattern lazily, on first use.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$',re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1), found.group(2)
886
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  Returns (host, None) when
    host does not end in ':' followed by at least one digit.
    """
    global _portprog
    if _portprog is None:
        # Compile the pattern lazily, on first use.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1), found.group(2)
899
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number.

    Only a non-empty run of the ASCII digits 0-9 counts as a valid number.
    """
    global _nportprog
    if _nportprog is None:
        # Compile the pattern lazily, on first use.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        nport = None
        # int() on its own is too lenient for a port: it also accepts a
        # sign or surrounding whitespace ('-80', '+80', ' 80'), so require
        # plain digits before converting.
        if port and not port.strip('0123456789'):
            try:
                nport = int(port)
            except ValueError:
                # Keep the "not a valid number" contract should int()
                # still refuse the digit string.
                nport = None
        return host, nport
    return host, defport
921
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Returns (url, None) when url
    contains no '?'.
    """
    global _queryprog
    if _queryprog is None:
        # Compile the pattern lazily, on first use.
        import re
        # Raw string: '\?' is not a valid escape sequence in an ordinary
        # string literal.  DOTALL lets the path part contain newlines.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$', re.DOTALL)

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
933
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Returns (url, None) when url
    contains no '#'.
    """
    global _tagprog
    if _tagprog is None:
        # Compile the pattern lazily, on first use.
        import re
        # DOTALL lets the part before '#' contain newlines; without it such
        # a URL would not be split at all.
        _tagprog = re.compile('^(.*)#([^#]*)$', re.DOTALL)

    match = _tagprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
945
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    # Everything before the first ';' is the path; each later piece is
    # one attribute.
    path, *attrs = url.split(';')
    return path, attrs
951
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Returns (attr, None) when attr
    contains no '='.
    """
    global _valueprog
    if _valueprog is None:
        # Compile the pattern lazily, on first use.
        import re
        # DOTALL lets the value contain newlines; without it such an
        # attribute would not be split, or would lose a trailing newline.
        _valueprog = re.compile('^([^=]*)=(.*)$', re.DOTALL)

    match = _valueprog.match(attr)
    if match:
        return match.group(1, 2)
    return attr, None