blob: 42f81936828d4cb9507fc8c3a8098ab8eb7d6743 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaranfd41e082010-04-17 14:44:14 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000011RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000012Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski, July 1998.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000015
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000019RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000020McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000028"""
29
Facundo Batista2ac5de22008-07-07 18:24:11 +000030import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000031import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000032
# Names exported by "from <module> import *".
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]

# A classification of schemes ('' means apply by default)
# uses_relative: consulted by urljoin() to decide whether joining applies.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
# uses_netloc: consulted by urlunsplit() and urljoin().
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
# non_hierarchical: not referenced by the functions visible in this part of
# the module.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
# uses_params: consulted by urlparse() before splitting off ';params'.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
# uses_query / uses_fragment: consulted by urlsplit() before splitting off
# '?query' and '#fragment'.
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# XXX: Consider replacing with functools.lru_cache
# urlsplit() empties the caches (via clear_cache()) once _parse_cache holds
# MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}
66
def clear_cache():
    """Clear the parse cache and the quoters cache."""
    # Both caches are plain dicts; empty them in place so existing
    # references to the dict objects stay valid.
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000071
72
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
# These two values are the default codec arguments of _encode_result()
# and _decode_args().
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
81
def _noop(obj):
    """Return obj unchanged (result coercion used for str inputs)."""
    return obj
84
def _encode_result(obj, encoding=_implicit_encoding,
                        errors=_implicit_errors):
    """Return obj.encode(encoding, errors).

    Used as the result coercion function when the caller passed non-str
    arguments; obj may be a str or a result object providing encode().
    """
    return obj.encode(encoding, errors)
88
def _decode_args(args, encoding=_implicit_encoding,
                       errors=_implicit_errors):
    """Return a tuple with every item of args decoded to str.

    Falsy items (empty bytes, '' or None) are not decoded; they become ''.
    """
    decoded = []
    for value in args:
        decoded.append(value.decode(encoding, errors) if value else '')
    return tuple(decoded)
92
def _coerce_args(*args):
    """Normalise the arguments to str and pick a matching result coercion.

    Returns the (possibly decoded) arguments followed by one extra item:
    _noop when the first argument is a str, _encode_result otherwise.

    Raises TypeError when a non-empty later argument is not of the same
    kind (str vs. non-str) as the first one.
    """
    expect_str = isinstance(args[0], str)
    # Empty values are exempt from the check so that the "scheme=''"
    # default argument of some functions works with bytes input too.
    if any(arg and isinstance(arg, str) != expect_str for arg in args[1:]):
        raise TypeError("Cannot mix str and non-str arguments")
    if not expect_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
108
109# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Encode every field and build the bytes counterpart from them.

        The concrete class must provide the _encoded_counterpart attribute.
        """
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return self._encoded_counterpart(*encoded_fields)
116
117
class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Decode every field and build the str counterpart from them.

        The concrete class must provide the _decoded_counterpart attribute.
        """
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return self._decoded_counterpart(*decoded_fields)
124
125
class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element

    Subclasses supply two properties:
      _userinfo -> (username, password)
      _hostinfo -> (hostname, port)
    where missing pieces are None and present pieces are str or bytes.
    """
    __slots__ = ()

    @property
    def username(self):
        """User name from the netloc, or None."""
        return self._userinfo[0]

    @property
    def password(self):
        """Password from the netloc, or None."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when the host part is empty."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        return hostname.lower()

    @property
    def port(self):
        """Port as an int, or None when no port is given.

        A netloc with a trailing colon but no digits (e.g. 'host:') has an
        empty port; it is reported as None instead of letting int('')
        raise.  A non-empty, non-numeric port still raises ValueError.
        """
        port = self._hostinfo[1]
        if not port:
            return None
        return int(port, 10)
153
154
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """netloc helpers for results whose fields are str."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last '@'."""
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last '@'.

        A bracketed host ('[...]') is returned without its brackets; port
        is None when there is no ':' introducing one.
        """
        host_port = self.netloc.rpartition('@')[2]
        _, open_bracket, after_bracket = host_port.partition('[')
        if open_bracket:
            host, _, tail = after_bracket.partition(']')
            _, colon, port = tail.partition(':')
        else:
            host, colon, port = host_port.partition(':')
        return host, (port if colon else None)
183
184
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """netloc helpers for results whose fields are bytes."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last b'@'."""
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(b':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last b'@'.

        A bracketed host (b'[...]') is returned without its brackets; port
        is None when there is no b':' introducing one.
        """
        host_port = self.netloc.rpartition(b'@')[2]
        _, open_bracket, after_bracket = host_port.partition(b'[')
        if open_bracket:
            host, _, tail = after_bracket.partition(b']')
            _, colon, port = tail.partition(b':')
        else:
            host, colon, port = host_port.partition(b':')
        return host, (port if colon else None)
213
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000214
from collections import namedtuple

# Plain namedtuple bases holding the fields; the result classes defined
# from them add geturl() and the encode/decode and netloc helpers.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple('SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple('ParseResult', 'scheme netloc path params query fragment')

# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr
225
226# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """(url, fragment) pair of str values."""
    __slots__ = ()

    def geturl(self):
        """Return the URL with '#fragment' re-attached when it is non-empty."""
        return self.url + '#' + self.fragment if self.fragment else self.url
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000234
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """Five-field str result: scheme, netloc, path, query, fragment."""
    __slots__ = ()

    def geturl(self):
        # Reassemble the fields into a single URL string.
        return urlunsplit(self)
239
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """Six-field str result: scheme, netloc, path, params, query, fragment."""
    __slots__ = ()

    def geturl(self):
        # Reassemble the fields into a single URL string.
        return urlunparse(self)
244
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000245# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """(url, fragment) pair of bytes values."""
    __slots__ = ()

    def geturl(self):
        """Return the URL with b'#fragment' re-attached when it is non-empty."""
        return self.url + b'#' + self.fragment if self.fragment else self.url
253
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """Five-field bytes result: scheme, netloc, path, query, fragment."""
    __slots__ = ()

    def geturl(self):
        # urlunsplit() decodes the fields and re-encodes its result.
        return urlunsplit(self)
258
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """Six-field bytes result: scheme, netloc, path, params, query, fragment."""
    __slots__ = ()

    def geturl(self):
        # urlunparse() decodes the fields and re-encodes its result.
        return urlunparse(self)
263
264# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Link each str result class with its bytes counterpart.

    Sets _encoded_counterpart on the str classes (read by
    _ResultMixinStr.encode) and _decoded_counterpart on the bytes classes
    (read by _ResultMixinBytes.decode).
    """
    _result_pairs = (
        (DefragResult, DefragResultBytes),
        (SplitResult, SplitResultBytes),
        (ParseResult, ParseResultBytes),
    )
    for _decoded, _encoded in _result_pairs:
        _decoded._encoded_counterpart = _encoded
        _encoded._decoded_counterpart = _decoded

# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000277
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, url, query, fragment = urlsplit(url, scheme,
                                                    allow_fragments)
    # Parameters are only split off for schemes known to use them.
    params = ''
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    return _coerce_result(
        ParseResult(scheme, netloc, url, params, query, fragment))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000293
def _splitparams(url):
    """Split a path into (path, params) at a ';'.

    When the path contains '/', only a ';' at or after the last '/' counts,
    and (url, '') is returned if there is none.  Otherwise the first ';'
    is used; the caller is expected to have checked that one exists.
    """
    last_slash = url.rfind('/')
    if last_slash < 0:
        semi = url.find(';')
    else:
        semi = url.find(';', last_slash)
        if semi < 0:
            return url, ''
    return url[:semi], url[semi + 1:]
302
def _splitnetloc(url, start=0):
    """Return (netloc, rest) for url, scanning from index start.

    The netloc ends at the earliest '/', '?' or '#' found at or after
    start, or at the end of url when none of them occurs.
    """
    positions = [url.find(delim, start) for delim in '/?#']
    found = [pos for pos in positions if pos >= 0]
    end = min(found) if found else len(url)
    return url[start:end], url[end:]
310
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Results are memoised in _parse_cache.  Raises ValueError("Invalid IPv6
    URL") when the netloc contains only one of '[' and ']'."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    # Note: clear_cache() empties the quoters cache as well.
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            # Fast path: skips the scheme_chars scan and the
            # uses_query/uses_fragment lookups.
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return _coerce_result(v)
        # A digit right after the colon is not treated as the start of a
        # scheme-qualified URL; the text before ':' then stays in the path.
        if url.endswith(':') or not url[i+1].isdigit():
            for c in url[:i]:
                if c not in scheme_chars:
                    break
            else:
                # Every character before ':' is a valid scheme character.
                scheme, url = url[:i].lower(), url[i+1:]
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000361
def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    (scheme, netloc, url, params, query, fragment,
     _coerce_result) = _coerce_args(*components)
    # Fold non-empty params back into the path before delegating.
    path = "%s;%s" % (url, params) if params else url
    return _coerce_result(urlunsplit((scheme, netloc, path, query, fragment)))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000372
def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    (scheme, netloc, url, query, fragment,
     _coerce_result) = _coerce_args(*components)
    # An authority part ('//netloc') is emitted when there is a netloc, or
    # when the scheme takes one and the path does not already start with '//'.
    scheme_wants_netloc = (scheme and scheme in uses_netloc
                           and url[:2] != '//')
    if netloc or scheme_wants_netloc:
        if url and not url.startswith('/'):
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000391
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.  The
    url is returned as given when its scheme differs from the base scheme
    or is not listed in uses_relative."""
    if not base:
        return url
    if not url:
        return base
    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default for a url that carries none.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            # url has its own authority: nothing is taken from the base path.
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: keep it, with the base netloc.
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))
    if not path and not params:
        # Only a query and/or fragment was given: reuse the base path.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))
    # Relative path: replace the last segment of the base path with it.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly drop a '<name>/..' pair until none is left; the last
    # segment is not examined here (handled just below).
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return _coerce_result(urlunparse((scheme, netloc, '/'.join(segments),
                                      params, query, fragment)))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000444
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        # Nothing to strip: hand the URL back as it is.
        defrag, frag = url, ''
    else:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
    return _coerce_result(DefragResult(defrag, frag))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000460
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    string may be a str (encoded as UTF-8 first) or a bytes-like object
    supporting split().  A '%' that is not followed by exactly two
    hexadecimal digits is left in the output untouched.  Returns bytes
    whenever at least one '%' is present; otherwise the (encoded) input is
    returned as is.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    res = string.split(b'%')
    if len(res) == 1:
        return string
    # int(x, 16) alone is too lenient: it accepts a single digit, a sign
    # and surrounding whitespace ('%a', '%+1', '% 1'), none of which is a
    # valid escape.  Check both characters explicitly.
    hexdigits = b'0123456789ABCDEFabcdef'
    parts = [res[0]]
    for item in res[1:]:
        code = item[:2]
        if (len(code) == 2 and code[0] in hexdigits
                and code[1] in hexdigits):
            parts.append(bytes([int(code, 16)]))
            parts.append(item[2:])
        else:
            parts.append(b'%')
            parts.append(item)
    return b''.join(parts)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000481
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is left
    in the output untouched.

    unquote('abc%20def') -> 'abc def'.
    """
    if string == '':
        return string
    res = string.split('%')
    if len(res) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    # bytes.fromhex() skips whitespace, so it would turn '%  ' into nothing;
    # validate the two characters explicitly instead.
    hexdigits = '0123456789ABCDEFabcdef'
    parts = [res[0]]
    # pct_sequence: byte values of a contiguous run of percent-escapes.
    # They are decoded together because one character may span several.
    pct_sequence = []
    for item in res[1:]:
        code = item[:2]
        if (len(code) == 2 and code[0] in hexdigits
                and code[1] in hexdigits):
            pct_sequence.append(int(code, 16))
            rest = item[2:]
            if not rest:
                # This segment was just a single percent-encoded character.
                # May be part of a sequence of code units, so delay decoding.
                continue
        else:
            rest = '%' + item
        # Encountered non-percent-encoded characters. Flush the current
        # pct_sequence.
        if pct_sequence:
            parts.append(bytes(pct_sequence).decode(encoding, errors))
            pct_sequence = []
        parts.append(rest)
    if pct_sequence:
        # Flush the final pct_sequence
        parts.append(bytes(pct_sequence).decode(encoding, errors))
    return ''.join(parts)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000525
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dict mapping each name to a list of values.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.
    """
    parsed = {}
    # Values for a repeated name are collected in order of appearance.
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        parsed.setdefault(name, []).append(value)
    return parsed
551
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Fields are separated by '&' or ';'.  Returns a list.
    """
    qs, _coerce_result = _coerce_args(qs)
    fields = [field for chunk in qs.split('&') for field in chunk.split(';')]
    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        name, equals, value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # Handle case of a control-name with no equal sign
            if not keep_blank_values:
                continue
        if value or keep_blank_values:
            name = _coerce_result(unquote(name.replace('+', ' ')))
            value = _coerce_result(unquote(value.replace('+', ' ')))
            result.append((name, value))
    return result
591
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # Plus signs are translated first, so an escaped '%2B' still yields '+'.
    return unquote(string.replace('+', ' '), encoding, errors)
600
# Byte values that are never percent-encoded, whatever 'safe' set the
# caller passes.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
# The same values as a bytes object (used as an rstrip() character set by
# quote_from_bytes()).
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache of quoting callables keyed by the normalized 'safe' bytes; filled by
# quote_from_bytes() and emptied by clear_cache().
_safe_quoters = {}
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000607
class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: compute the quoted form of byte value b, remember it
        # in the mapping and return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
629
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects. encoding and
    errors must not be specified if string is a bytes object; doing so
    raises TypeError.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Already bytes: there is nothing to encode, so codec arguments
        # would be meaningless.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if not string:
        return string
    string = string.encode('utf-8' if encoding is None else encoding,
                           'strict' if errors is None else errors)
    return quote_from_bytes(string, safe)
673
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if isinstance(string, str):
        may_have_space = ' ' in string
    elif isinstance(string, bytes):
        may_have_space = b' ' in string
    else:
        # Any other type takes the general path.
        may_have_space = True
    if not may_have_space:
        return quote(string, safe, encoding, errors)
    # Keep spaces unescaped through quote(), then turn them into '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000690
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'

    Raises TypeError when bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    # Normalize 'safe' by converting to bytes and removing non-ASCII chars
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)
    # Shortcut: nothing is left after stripping safe bytes from the right,
    # so no byte needs escaping.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        # One Quoter per distinct safe set; its bound __getitem__ is cached.
        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000712
def _quote_query_component(value, safe, encoding, errors):
    """Quote one key or value for urlencode().

    bytes are handed to quote_plus() as they are; any other object is
    converted with str() and quoted using encoding and errors.
    """
    if isinstance(value, bytes):
        # quote() raises TypeError when encoding/errors accompany bytes,
        # so they are deliberately not forwarded here.
        return quote_plus(value, safe)
    return quote_plus(str(value), safe, encoding, errors)


def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a mapping or a sequence of two-element tuples into a URL query string.

    If query is a sequence of two-element tuples, the parameters appear in
    the output in the same order as in the input.

    Keys and values may be str or bytes; any other object is converted
    with str().  For non-bytes components, safe, encoding and errors are
    passed on to quote_plus(); for bytes components only safe is used.

    If doseq is true, a value that is neither str nor bytes but supports
    len() is treated as a sequence, and each of its elements becomes a
    separate ``key=element`` parameter.

    Raises TypeError if query is neither a mapping nor a sequence whose
    first element is a tuple (a non-empty string, for instance).
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # Strings and string-like objects are sequences too, so they have
        # to be rejected explicitly.
        try:
            # Objects without len() raise TypeError here; non-empty
            # strings fail the tuple check.  Zero-length sequences of any
            # type (including '') pass, which keeps empty input producing
            # an empty result.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    pairs = []
    for k, v in query:
        key = _quote_query_component(k, safe, encoding, errors)
        elements = (v,)
        if doseq and not isinstance(v, (bytes, str)):
            try:
                # Supporting len() is the test for "is a sequence".
                len(v)
            except TypeError:
                pass  # not a sequence: keep it as a single value
            else:
                elements = v
        pairs.extend(
            key + '=' + _quote_query_component(elt, safe, encoding, errors)
            for elt in elements)
    return '&'.join(pairs)
790
791# Utilities to parse URLs (most of these return None for missing parts):
792# unwrap('<URL:type://host/path>') --> 'type://host/path'
793# splittype('type:opaquestring') --> 'type', 'opaquestring'
794# splithost('//host[:port]/path') --> 'host[:port]', '/path'
795# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
796# splitpasswd('user:passwd') -> 'user', 'passwd'
797# splitport('host:port') --> 'host', 'port'
798# splitquery('/path?query') --> '/path', 'query'
799# splittag('/path#tag') --> '/path', 'tag'
800# splitattr('/path;attr1=value1;attr2=value2;...') ->
801# '/path', ['attr1=value1', 'attr2=value2', ...]
802# splitvalue('attr=value') --> 'attr', 'value'
803# urllib.parse.unquote('abc%20def') -> 'abc def'
804# quote('abc def') -> 'abc%20def'
805
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str argument is checked to be pure ASCII and returned as a str;
    any other object is returned unchanged.  Raises UnicodeError when a
    str contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        # The encode/decode round trip only succeeds for ASCII-only text.
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
818
819def unwrap(url):
820 """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
821 url = str(url).strip()
822 if url[:1] == '<' and url[-1:] == '>':
823 url = url[1:-1].strip()
824 if url[:4] == 'URL:': url = url[4:].strip()
825 return url
826
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The scheme is returned lower-cased.  When url has no leading
    'scheme:' part, the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # Compile the pattern lazily, on first use.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    scheme = found.group(1)
    # Skip the scheme and the ':' that follows it.
    remainder = url[len(scheme) + 1:]
    return scheme.lower(), remainder
840
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when url does not start with '//'.  A non-empty
    remainder that does not begin with '/' (for example '?query') is
    returned with a '/' prepended.
    """
    global _hostprog
    if _hostprog is None:
        # Compile the pattern lazily, on first use.
        import re
        # re.S lets '.' match newlines, so a URL with an embedded newline
        # after the host is still split instead of being returned whole.
        _hostprog = re.compile(r'^//([^/?]*)(.*)$', re.S)

    match = _hostprog.match(url)
    if match is None:
        return None, url
    host_port, path = match.group(1, 2)
    if path and not path.startswith('/'):
        path = '/' + path
    return host_port, path
857
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Returns (None, host) when host
    contains no '@'.
    """
    global _userprog
    if _userprog is None:
        # Compile the pattern lazily, on first use.
        import re
        # re.S lets '.' match newlines, so user info containing a newline
        # is still separated from the host.
        _userprog = re.compile(r'^(.*)@(.*)$', re.S)

    match = _userprog.match(host)
    if match is None:
        return None, host
    return match.group(1, 2)
869
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Returns (user, None) when there
    is no ':' in the argument.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compile the pattern lazily, on first use.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$',re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
881
882# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  Returns (host, None)
    when host does not end in ':' followed by at least one digit.
    """
    global _portprog
    if _portprog is None:
        # Compile the pattern lazily, on first use.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1, 2)
894
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning a numeric port.

    Return (host, defport) if no ':' is found; defport defaults to -1.
    Return (host, int(port)) if a valid number is found after the last ':'.
    Return (host, None) if there is a ':' but no valid number after it.
    """
    global _nportprog
    if _nportprog is None:
        # Compile the pattern lazily, on first use.
        import re
        # re.S lets '.' match newlines, so a host containing a newline is
        # still split at its last ':'.
        _nportprog = re.compile(r'^(.*):(.*)$', re.S)

    match = _nportprog.match(host)
    if match is None:
        return host, defport
    host, port = match.group(1, 2)
    try:
        # An empty port (trailing ':') is reported as None, just like a
        # port that int() cannot parse.
        nport = int(port) if port else None
    except ValueError:
        nport = None
    return host, nport
916
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Returns (url, None) when url
    contains no '?'.
    """
    global _queryprog
    if _queryprog is None:
        # Compile the pattern lazily, on first use.
        import re
        # Raw string: '\?' is not a valid escape in a normal string
        # literal.  re.S lets '.' match newlines, so a path containing a
        # newline is still split from its query.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$', re.S)

    match = _queryprog.match(url)
    if match is None:
        return url, None
    return match.group(1, 2)
928
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Returns (url, None) when url
    contains no '#'.
    """
    global _tagprog
    if _tagprog is None:
        # Compile the pattern lazily, on first use.
        import re
        # re.S lets '.' match newlines, so a path containing a newline is
        # still split from its fragment.
        _tagprog = re.compile(r'^(.*)#([^#]*)$', re.S)

    match = _tagprog.match(url)
    if match is None:
        return url, None
    return match.group(1, 2)
940
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
946
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Returns (attr, None) when attr
    contains no '='.
    """
    global _valueprog
    if _valueprog is None:
        # Compile the pattern lazily, on first use.
        import re
        # re.S lets '.' match newlines, so a value containing a newline
        # is still split from its attribute name.
        _valueprog = re.compile(r'^([^=]*)=(.*)$', re.S)

    match = _valueprog.match(attr)
    if match is None:
        return attr, None
    return match.group(1, 2)