blob: 38efd502ab17ca4cee815137b126b60eb6a4ecae [file] [log] [blame]
"""Parse (absolute and relative) URLs.

urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
"""
29
Facundo Batista2ac5de22008-07-07 18:24:11 +000030import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000031import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000032
# Names exported by a star-import of this module.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000037
# Scheme classification tables.  The empty string stands for "no scheme",
# i.e. the behaviour that applies by default.  These are plain lists so
# that callers can register additional schemes at run time.

# Schemes for which a relative reference may be joined onto a base URL.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', '', 'sftp',
]

# Schemes whose URLs carry a "//netloc" authority part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
    'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
]

non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips',
]

# Schemes whose path may carry ";params" (consulted by urlparse()).
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    'mms', '', 'sftp',
]

# Schemes for which urlsplit() splits off a "?query".
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '',
]

# Schemes for which urlsplit() splits off a "#fragment".
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero', '',
]

# Characters valid in scheme names
scheme_chars = ''.join((
    'abcdefghijklmnopqrstuvwxyz',
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
    '0123456789',
    '+-.',
))

# urlsplit() memoises its results in _parse_cache and wipes the whole
# cache once it holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}
65
def clear_cache():
    """Empty both module-level caches: parsed URLs and cached quoters."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000070
71
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    The host class must expose a ``netloc`` string attribute of the form
    ``[user[:password]@]host[:port]``; every property below is derived
    from it on each access.
    """

    @property
    def username(self):
        """The user name in netloc, or None when there is no '@'."""
        netloc = self.netloc
        if "@" in netloc:
            # Split at the LAST '@' so that an '@' inside the user info
            # does not cut the host short.
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """The password in netloc, or None when none is given."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                # Everything after the first ':' belongs to the password.
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """The lower-cased host name, or None when netloc has no host part.

        For a bracketed IPv6 literal the brackets are stripped.
        """
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """The port as an int, or None when it is absent or empty.

        Raises ValueError when the port is present but not a decimal number.
        """
        # Drop the user info and any bracketed IPv6 literal first, so the
        # only ':' left is the one introducing the port.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            if not port:
                # "host:" has an empty port, which is valid syntax
                # (RFC 3986: port = *DIGIT); report it as "no port"
                # instead of letting int('') raise ValueError.
                return None
            return int(port, 10)
        else:
            return None
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000114
115from collections import namedtuple
116
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """5-tuple produced by urlsplit(), with the ResultMixin properties."""

    # No per-instance __dict__: instances stay as light as plain tuples.
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunsplit()."""
        return urlunsplit(self)
123
124
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """6-tuple produced by urlparse(), with the ResultMixin properties."""

    # No per-instance __dict__: instances stay as light as plain tuples.
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunparse()."""
        return urlunparse(self)
131
132
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Returns a ParseResult, i.e. the 6-tuple
    (scheme, netloc, path, params, query, fragment).  The components are
    not broken up any further (netloc stays a single string) and
    %-escapes are not expanded.
    """
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    # Parameters are only split off for schemes listed in uses_params.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
146
def _splitparams(url):
    """Split a path into ``(path, params)``.

    The split happens at the first ';' found at or after the last '/',
    so only the final path segment can contribute parameters.  When no
    such ';' exists the path is returned unchanged with empty params.
    """
    # rfind() yields -1 when there is no '/'; clamp to 0 so the search
    # then covers the whole string instead of only its last character.
    start = max(url.rfind('/'), 0)
    i = url.find(';', start)
    if i < 0:
        # Previously a slash-less path without ';' fell through with
        # i == -1 and was sliced into (url[:-1], url).
        return url, ''
    return url[:i], url[i+1:]
155
def _splitnetloc(url, start=0):
    """Return ``(netloc, rest)`` for *url*, scanning from index *start*.

    The netloc runs from *start* up to the earliest '/', '?' or '#'
    (or to the end of the string when none occurs); *rest* is everything
    from that delimiter onwards.
    """
    hits = [pos for pos in (url.find(ch, start) for ch in '/?#') if pos >= 0]
    end = min(hits) if hits else len(url)
    return url[start:end], url[end:]
163
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Returns a SplitResult, i.e. the 5-tuple
    (scheme, netloc, path, query, fragment).  The components are not
    broken up any further (netloc stays a single string) and %-escapes
    are not expanded.  *scheme* is the default used when the URL itself
    does not specify one.

    Results are memoised in the module-level parse cache.  Raises
    ValueError when the netloc contains only one of '[' and ']'.
    """
    allow_fragments = bool(allow_fragments)
    cache_key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(cache_key)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # Avoid runaway growth: throw the whole cache away.
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':
            # Fast path for the common case: http is known to use
            # netloc, query and fragment, so no table lookups are needed.
            scheme = prefix.lower()
            url = url[colon + 1:]
            if url.startswith('//'):
                netloc, url = _splitnetloc(url, 2)
                # Exactly one of '[' / ']' present => unbalanced brackets.
                if ('[' in netloc) != (']' in netloc):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, _, fragment = url.partition('#')
            if '?' in url:
                url, _, query = url.partition('?')
            result = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[cache_key] = result
            return result
        # A digit right after the colon is not treated as a scheme
        # separator, unless the colon is the very last character.
        if url.endswith(':') or not url[colon + 1].isdigit():
            if all(ch in scheme_chars for ch in prefix):
                scheme, url = prefix.lower(), url[colon + 1:]

    if url.startswith('//'):
        netloc, url = _splitnetloc(url, 2)
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, _, fragment = url.partition('#')
    if scheme in uses_query and '?' in url:
        url, _, query = url.partition('?')
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
213
def urlunparse(components):
    """Rebuild a URL string from a 6-item iterable as returned by urlparse().

    The result may differ slightly from the URL that was parsed, while
    remaining equivalent, if the original had redundant delimiters
    (e.g. a '?' with an empty query).
    """
    scheme, netloc, path, params, query, fragment = components
    if params:
        # Re-attach the parameters to the path, then defer to urlunsplit().
        path = "%s;%s" % (path, params)
    return urlunsplit((scheme, netloc, path, query, fragment))
223
def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string.

    *components* can be any five-item iterable of
    (scheme, netloc, path, query, fragment).  The result may differ
    slightly from the URL that was parsed, while remaining equivalent, if
    the original had unnecessary delimiters (for example, a '?' with an
    empty query).
    """
    scheme, netloc, path, query, fragment = components
    # Emit "//netloc" when there is a netloc, or when the scheme is one
    # that uses a netloc and the path does not already begin with "//".
    wants_authority = netloc or (scheme and scheme in uses_netloc
                                 and path[:2] != '//')
    if wants_authority:
        if path and path[:1] != '/':
            path = '/' + path
        path = '//' + (netloc or '') + path
    if scheme:
        path = scheme + ':' + path
    if query:
        path = path + '?' + query
    if fragment:
        path = path + '#' + fragment
    return path
241
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.
    *url* is also returned unchanged when its scheme differs from the
    base's scheme or is not listed in uses_relative.  *allow_fragments*
    is passed through to urlparse() for both arguments.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # Parse url with the base's scheme as its default, so that a
    # scheme-less reference compares equal to the base just below.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own netloc: nothing is taken from
            # the base except the (already defaulted) scheme.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Path starts with '/': use it as is, without merging.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: reuse the base path.
        path = bpath
        if not params:
            params = bparams
        else:
            # The reference has its own params: drop the last character
            # of the base path and return without inheriting the query.
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                                params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: replace the last segment of the base path with the
    # segments of the reference, then resolve '.' and '..'.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        # A trailing '.' becomes an empty segment (keeps a trailing '/').
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly delete one "<name>/.." pair, rescanning from the start
    # after each deletion, until a full scan finds none.  The first and
    # last segments are never examined as the '..' (i runs over 1..n-1).
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # Handle a '..' left in the final position, which the loop skips.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
298
def urldefrag(url):
    """Strip the fragment identifier from *url*.

    Returns a ``(url_without_fragment, fragment)`` tuple.  When *url*
    contains no '#', it is returned unchanged and the fragment is the
    empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
312
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str (encoded as UTF-8 first) or a bytes-like
    object.  Every '%' followed by exactly two hexadecimal digits is
    replaced by the byte it denotes; any other '%' is left untouched.
    The result is returned as bytes.
    """
    # Note: strings are encoded as UTF-8.  This is only an issue if it
    # contains unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?  Touching .split makes falsy
        # non-strings (None, 0, ...) fail loudly with AttributeError
        # instead of silently turning into b''.
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    chunks = string.split(b'%')
    if len(chunks) == 1:
        return string
    hexdigits = b'0123456789ABCDEFabcdef'
    out = [chunks[0]]
    for item in chunks[1:]:
        code = item[:2]
        # Check the two digits explicitly: int(x, 16) on its own would
        # also accept a lone digit, a sign or surrounding whitespace
        # ('%a', '%+1', '% 1') and decode things that are not escapes.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            out.append(bytes([int(code, 16)]))
            out.append(item[2:])
        else:
            out.append(b'%')
            out.append(item)
    # One join instead of repeated "+=" keeps this linear in the input.
    return b''.join(out)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000333
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    The optional *encoding* and *errors* parameters specify how to decode
    percent-encoded sequences into Unicode characters, as accepted by the
    bytes.decode() method.  By default, percent-encoded sequences are
    decoded with UTF-8, and invalid sequences are replaced by a
    placeholder character.  A '%' that is not followed by exactly two
    hexadecimal digits is left in the output unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if string == '':
        return string
    chunks = string.split('%')
    if len(chunks) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    # pct_sequence: contiguous run of percent-encoded bytes, not yet
    # decoded because a multi-byte character may span several escapes.
    pct_sequence = bytearray()
    out = [chunks[0]]
    for item in chunks[1:]:
        code = item[:2]
        # Check the two digits explicitly: bytes.fromhex() ignores
        # whitespace, so '%  x' used to decode to nothing and the
        # '%  ' was silently dropped from the output.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            pct_sequence.append(int(code, 16))
            rest = item[2:]
            if not rest:
                # This segment was just a single percent-encoded byte.
                # It may be part of a longer sequence, so delay decoding.
                continue
        else:
            # Not a valid escape: keep the '%' and the text verbatim.
            rest = '%' + item
        # Encountered non-percent-encoded characters.  Flush the current
        # pct_sequence before appending them.
        out.append(pct_sequence.decode(encoding, errors))
        out.append(rest)
        pct_sequence = bytearray()
    if pct_sequence:
        # Flush the final pct_sequence.
        out.append(pct_sequence.decode(encoding, errors))
    return ''.join(out)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000377
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dict mapping each name to a list of values.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: if true, blank values are kept as empty strings;
        if false (the default) fields with a blank value are dropped as
        if they had not been included.

    strict_parsing: if false (the default), parsing errors are silently
        ignored; if true, they raise a ValueError exception.

    Values appear in each list in the order parse_qsl() yields them.
    """
    result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        result.setdefault(name, []).append(value)
    return result
403
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of ``(name, value)`` pairs.

    Arguments:

    qs: percent-encoded query string to be parsed.  Fields are separated
        by '&' or ';'.

    keep_blank_values: if true, blank values are kept as empty strings;
        if false (the default) fields with a blank value are dropped as
        if they had not been included.

    strict_parsing: if false (the default), parsing errors are silently
        ignored; if true, they raise a ValueError exception.

    In both name and value '+' is turned into a space before %-escapes
    are expanded.
    """
    fields = []
    for chunk in qs.split('&'):
        fields.extend(chunk.split(';'))

    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        name, sep, value = field.partition('=')
        if not sep:
            # A control name with no equal sign.
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            if not keep_blank_values:
                continue
        if value or keep_blank_values:
            name = unquote(name.replace('+', ' '))
            value = unquote(value.replace('+', ' '))
            result.append((name, value))

    return result
443
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but first turn every plus sign into a space, as
    required for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
452
# Byte values that are never percent-encoded, whatever 'safe' says:
# ASCII letters, digits and the three punctuation marks "_.-".
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    b'abcdefghijklmnopqrstuvwxyz'
    b'0123456789'
    b'_.-'
)
# The same set as a bytes object, usable as an rstrip() argument.
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache of quoting callables, keyed by the normalised 'safe' bytes.
_safe_quoters = dict()
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000459
class Quoter(collections.defaultdict):
    """A mapping from byte values (ints) to their quoted string form.

    A byte that belongs to the "safe" set maps to the corresponding
    character; every other byte maps to its '%XX' escape.
    """
    # The defaultdict base doubles as the cache: once a key has been
    # stored, later lookups never reach the Python-level __missing__.

    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, instances would display as a plain defaultdict.
        return "<Quoter %r>" % (dict(self),)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
481
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, quote() is intended for quoting the path section of a
    URL, so '/' is left alone: in a path the existing slashes are being
    used in their reserved role.

    string and safe may be either str or bytes objects.  encoding and
    errors must not be specified if string is a bytes object (TypeError
    is raised otherwise).

    For a str, the optional encoding and errors parameters specify how to
    deal with non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Already bytes: there is nothing to encode, so reject the
        # text-only options rather than silently ignoring them.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if not string:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'strict'
    return quote_from_bytes(string.encode(encoding, errors), safe)
525
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values.  Plus signs in the original string are escaped unless
    they are included in safe.  Unlike quote(), safe does not default to '/'.
    """
    # string may be str or bytes.  Without any space in it, plain quote()
    # already produces the right answer.
    space_free = ((isinstance(string, str) and ' ' not in string) or
                  (isinstance(string, bytes) and b' ' not in string))
    if space_free:
        return quote(string, safe, encoding, errors)
    # Let spaces pass through quote() unescaped (the space literal must
    # match the type of safe), then turn them into '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000542
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.

    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'

    Raises TypeError if bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    # Normalise 'safe' to bytes containing ASCII values only.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)
    # Nothing left after stripping safe bytes => nothing needs quoting.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000564
def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a sequence of two-element tuples or dictionary into a URL
    query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values may be str or bytes; anything else is converted with
    str() first.  bytes are passed to quote_plus() with safe only, while
    text is passed with safe, encoding and errors.

    Raises TypeError if query is neither a mapping nor a sequence of
    tuples (a non-empty string is rejected too).
    """

    def _quote(item):
        # bytes are quoted as they are; everything else is stringified.
        if isinstance(item, bytes):
            return quote_plus(item, safe)
        return quote_plus(str(item), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # Objects without len() fail here, and so do non-empty
            # strings, because their first item is not a tuple.
            # Zero-length sequences of any type get through; that is a
            # minor nit, kept for consistency with empty dicts.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError as exc:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                exc.__traceback__)

    pairs = []
    for k, v in query:
        key = _quote(k)
        if not doseq:
            pairs.append(key + '=' + _quote(v))
        elif isinstance(v, bytes):
            pairs.append(key + '=' + quote_plus(v, safe))
        elif isinstance(v, str):
            pairs.append(key + '=' + quote_plus(v, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # Not a sequence: encode it as one scalar value.
                pairs.append(key + '=' + _quote(v))
            else:
                # A sequence: one "key=element" parameter per element.
                for elt in v:
                    pairs.append(key + '=' + _quote(elt))
    return '&'.join(pairs)
642
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
657
def to_bytes(url):
    """Check that a str URL is pure ASCII and return it: to_bytes("URL") --> 'URL'.

    Despite the name, a str argument comes back as a str; an argument
    that is not a str is returned untouched.  Raises UnicodeError if a
    str URL contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
670
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; an enclosing '<...>' pair and a leading 'URL:' prefix are
    then removed, stripping whitespace again after each removal.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
678
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned lower-cased.  When url has no leading
    'type:' part, (None, url) is returned.
    """
    global _typeprog
    if _typeprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # The match is anchored at 0 and ends just past the ':'.
    return found.group(1).lower(), url[found.end():]
692
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '?' or '#' (the same delimiters
    _splitnetloc() uses), or at the end of the string.  Returns
    (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # Compiled on first use and kept in the module global.
        import re
        # '#' must end the host too; otherwise '//good#@evil/' would
        # report 'good#@evil' as the host.  re.S lets the remainder
        # contain newlines instead of making the whole match fail.
        _hostprog = re.compile('^//([^/?#]*)(.*)$', re.S)

    match = _hostprog.match(url)
    if match: return match.group(1, 2)
    return None, url
704
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'; both parts are passed through
    unquote().  Returns (None, host) when host contains no '@'.  Either
    way the result is a 2-tuple.
    """
    global _userprog
    if _userprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match:
        # map() is lazy in Python 3: materialise it so that this branch
        # returns an indexable tuple, like the no-match branch does.
        return tuple(map(unquote, match.group(1, 2)))
    return None, host
716
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; the password may contain further
    colons and newlines.  Returns (user, None) when there is no ':'.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$', re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.groups()
728
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  Returns (host, None)
    when host does not end in ':' followed by at least one digit.
    """
    global _portprog
    if _portprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.groups()
741
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning a numeric port.

    Return the given default port if no ':' is found; it defaults to -1.
    Return the port as an int if a valid number is found after the
    last ':'.
    Return None as the port if there is a ':' but no valid number."""
    global _nportprog
    if _nportprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port_text = found.groups()
    try:
        number = int(port_text) if port_text else None
    except ValueError:
        number = None
    return host, number
763
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Returns (url, None) when url
    contains no '?'.
    """
    global _queryprog
    if _queryprog is None:
        # Compiled on first use and kept in the module global.
        import re
        # Raw string: '\?' in a normal literal is an invalid escape
        # sequence, which newer Python versions warn about.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None
775
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Returns (url, None) when url
    contains no '#'.
    """
    global _tagprog
    if _tagprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.groups()
787
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
793
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Returns (attr, None) when attr
    contains no '='.
    """
    global _valueprog
    if _valueprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.groups()