blob: 00f0e5bec49ee18beb5e0d094ec5d787af21ec09 [file] [log] [blame]
"""Parse (absolute and relative) URLs.

The urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to the
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
"""
29
Facundo Batista2ac5de22008-07-07 18:24:11 +000030import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000031import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000032
# Public API of this module.  urlencode() is a public function defined
# further down, so it is exported together with the other helpers.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() memoizes its results in _parse_cache; once the cache holds
# MAX_CACHE_SIZE entries it is cleared rather than allowed to keep growing.
MAX_CACHE_SIZE = 20
_parse_cache = {}
65
def clear_cache():
    """Empty the URL parse cache and the cache of quoter callables."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000070
71
class ResultMixin(object):
    """Shared accessors for the parsed result objects.

    Every property is derived on demand from ``self.netloc``, which is
    expected to look like ``[user[:password]@]host[:port]``.
    """

    @property
    def username(self):
        """User name from the netloc, or None when there is no '@'."""
        userinfo, at_sign, _ = self.netloc.rpartition("@")
        if not at_sign:
            return None
        # Everything before the first ':' of the userinfo is the user name.
        return userinfo.partition(":")[0]

    @property
    def password(self):
        """Password from the netloc, or None when none is given."""
        userinfo, at_sign, _ = self.netloc.rpartition("@")
        if not at_sign or ":" not in userinfo:
            return None
        return userinfo.split(":", 1)[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when the host part is empty.

        A bracketed IPv6 literal is returned without its brackets.
        """
        hostinfo = self.netloc.split('@')[-1]
        if '[' in hostinfo and ']' in hostinfo:
            # '[::1]:80' -> '::1'
            return hostinfo.split(']')[0][1:].lower()
        if ':' in hostinfo:
            return hostinfo.split(':')[0].lower()
        if hostinfo == '':
            return None
        return hostinfo.lower()

    @property
    def port(self):
        """Port number as an int, or None when no port is given.

        An empty port (a trailing ':' as in 'host:') is treated as missing
        instead of being passed to int().  A non-numeric port still raises
        ValueError.
        """
        # Strip the userinfo and any bracketed IPv6 literal first so that
        # the colons inside the address are not mistaken for the port
        # separator.
        hostinfo = self.netloc.split('@')[-1].split(']')[-1]
        if ':' not in hostinfo:
            return None
        port = hostinfo.split(':')[1]
        if not port:
            return None
        return int(port, 10)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000114
115from collections import namedtuple
116
class SplitResult(namedtuple('SplitResult',
                             ['scheme', 'netloc', 'path', 'query',
                              'fragment']),
                  ResultMixin):
    """Five-field named tuple with the ResultMixin netloc accessors."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL string using urlunsplit()."""
        return urlunsplit(self)
123
124
class ParseResult(namedtuple('ParseResult',
                             ['scheme', 'netloc', 'path', 'params', 'query',
                              'fragment']),
                  ResultMixin):
    """Six-field named tuple with the ResultMixin netloc accessors."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL string using urlunparse()."""
        return urlunparse(self)
131
132
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).

    The heavy lifting is done by urlsplit(); this function only separates
    the params from the path, and only for schemes listed in uses_params.
    Components are not broken up further (netloc stays a single string)
    and % escapes are not expanded."""
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
146
147def _splitparams(url):
148 if '/' in url:
149 i = url.find(';', url.rfind('/'))
150 if i < 0:
151 return url, ''
152 else:
153 i = url.find(';')
154 return url[:i], url[i+1:]
155
156def _splitnetloc(url, start=0):
157 delim = len(url) # position of end of domain part of url, default is end
158 for c in '/?#': # look for delimiters; the order is NOT important
159 wdelim = url.find(c, start) # find first of this delim
160 if wdelim >= 0: # if found
161 delim = min(delim, wdelim) # use earliest delim position
162 return url[start:delim], url[delim:] # return (domain, rest)
163
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).

    Results are memoized in _parse_cache.  Components are not broken up
    further (netloc stays a single string) and % escapes are not expanded.
    Raises ValueError("Invalid IPv6 URL") when the netloc contains only one
    of '[' and ']'."""
    allow_fragments = bool(allow_fragments)
    cache_key = (url, scheme, allow_fragments, type(url), type(scheme))
    hit = _parse_cache.get(cache_key)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = ''
    query = ''
    fragment = ''
    colon = url.find(':')
    # The literal prefix 'http' takes a fast path that skips the scheme
    # character check and the uses_fragment / uses_query lookups.
    is_http = colon > 0 and url[:colon] == 'http'
    if is_http:
        scheme, url = 'http', url[colon + 1:]
    elif colon > 0 and (url.endswith(':') or not url[colon + 1].isdigit()):
        # A digit right after the ':' means the prefix is not taken as a
        # scheme (e.g. 'host:80'); otherwise it must consist solely of
        # valid scheme characters.
        candidate = url[:colon]
        if all(ch in scheme_chars for ch in candidate):
            scheme, url = candidate.lower(), url[colon + 1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Exactly one of the two brackets present means they are unbalanced.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url and (is_http or scheme in uses_fragment):
        url, fragment = url.split('#', 1)
    if '?' in url and (is_http or scheme in uses_query):
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
213
def urlunparse(components):
    """Put a parsed URL back together again.

    *components* is a six-item iterable (scheme, netloc, path, params,
    query, fragment).  Non-empty params are re-attached to the path with
    ';' and the rest is delegated to urlunsplit().  The result may be a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query."""
    scheme, netloc, path, params, query, fragment = components
    path_with_params = "%s;%s" % (path, params) if params else path
    return urlunsplit((scheme, netloc, path_with_params, query, fragment))
223
224def urlunsplit(components):
Senthil Kumaran8749a632010-06-28 14:08:00 +0000225 """Combine the elements of a tuple as returned by urlsplit() into a
226 complete URL as a string. The data argument can be any five-item iterable.
227 This may result in a slightly different, but equivalent URL, if the URL that
228 was parsed originally had unnecessary delimiters (for example, a ? with an
229 empty query; the RFC states that these are equivalent)."""
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000230 scheme, netloc, url, query, fragment = components
231 if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
232 if url and url[:1] != '/': url = '/' + url
233 url = '//' + (netloc or '') + url
234 if scheme:
235 url = scheme + ':' + url
236 if query:
237 url = url + '?' + query
238 if fragment:
239 url = url + '#' + fragment
240 return url
241
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty *base* returns *url* and an empty *url* returns *base*.  When
    the two schemes differ, or the scheme is not in uses_relative, *url* is
    returned unchanged."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery = \
            urlparse(base, '', allow_fragments)[:5]
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference carries its own authority: nothing to inherit.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        if params:
            # Reference has params but no path: use the base path minus
            # its final character.
            return urlunparse((scheme, netloc, bpath[:-1],
                               params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, bpath,
                           bparams, query, fragment))

    # Relative path: replace the last segment of the base path with the
    # reference's segments, then resolve '.' and '..'.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly drop the first '<name>/..' pair, ignoring the final
    # segment, until a full pass finds none.
    while True:
        limit = len(segments) - 1
        for idx in range(1, limit):
            if (segments[idx] == '..'
                    and segments[idx - 1] not in ('', '..')):
                del segments[idx - 1:idx + 1]
                break
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
298
def urldefrag(url):
    """Remove any existing fragment from *url*.

    Returns a tuple ``(defragmented_url, fragment)``.  When *url* contains
    no '#', it is returned unchanged and the fragment is the empty string.
    """
    if '#' not in url:
        return url, ''
    parts = urlparse(url)
    fragment = parts[5]
    # Rebuild the URL from the first five fields plus an empty fragment.
    stripped = urlunparse(tuple(parts[:5]) + ('',))
    return stripped, fragment
312
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes-like object; a str is first encoded
    as UTF-8 (only relevant if it contains unescaped non-ASCII characters,
    which URIs should not).  A '%' that is not followed by exactly two
    hexadecimal digits is left in the output untouched.
    """
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escapes = string.split(b'%')
    if not escapes:
        # Nothing to decode; normalise an empty input to b''.
        return string or b''
    hexdigits = b'0123456789ABCDEFabcdef'
    pieces = [head]
    for item in escapes:
        pair = item[:2]
        # Check the two digits explicitly: int(x, 16) alone would also
        # accept a single digit, a sign or surrounding whitespace.
        if len(pair) == 2 and pair[0] in hexdigits and pair[1] in hexdigits:
            pieces.append(bytes([int(pair, 16)]))
            pieces.append(item[2:])
        else:
            pieces.append(b'%')
            pieces.append(item)
    return b''.join(pieces)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000331
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    The optional encoding and errors parameters specify how to decode
    percent-encoded sequences into Unicode characters, as accepted by the
    bytes.decode() method.  By default, percent-encoded sequences are
    decoded with UTF-8, and invalid sequences are replaced by a placeholder
    character.  A '%' that is not followed by exactly two hexadecimal
    digits is kept literally.

    unquote('abc%20def') -> 'abc def'.
    """
    if string == '':
        return string
    head, *escapes = string.split('%')
    if not escapes:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    pieces = [head]
    # pct_sequence: contiguous run of percent-encoded bytes, not yet decoded
    pct_sequence = b''
    for item in escapes:
        pair = item[:2]
        # Validate the digits explicitly: bytes.fromhex() skips whitespace,
        # so a pair made of whitespace would otherwise vanish from the
        # output instead of being kept.
        if len(pair) == 2 and pair[0] in hexdigits and pair[1] in hexdigits:
            pct_sequence += bytes([int(pair, 16)])
            rest = item[2:]
            if not rest:
                # This segment was just a single percent-encoded byte.  It
                # may be part of a multi-byte sequence, so delay decoding.
                continue
        else:
            rest = '%' + item
        # Encountered non-percent-encoded characters: flush the pending
        # byte sequence before appending them.
        pieces.append(pct_sequence.decode(encoding, errors))
        pieces.append(rest)
        pct_sequence = b''
    if pct_sequence:
        # Flush the final pct_sequence
        pieces.append(pct_sequence.decode(encoding, errors))
    return ''.join(pieces)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000375
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dict mapping each name to a list of values.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    The pairs come from parse_qsl(); values of a repeated name are
    collected in the order in which they appear.
    """
    result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        result.setdefault(name, []).append(value)
    return result
401
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Fields are separated by '&' or ';'.  In both name and value, '+' is
    turned into a space before the % escapes are decoded with unquote().
    """
    result = []
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not field and not strict_parsing:
                continue
            raw_name, equals, raw_value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control name with no equal sign only survives when
                # blank values are kept (its value is then '').
                if not keep_blank_values:
                    continue
            if raw_value or keep_blank_values:
                name = unquote(raw_name.replace('+', ' '))
                value = unquote(raw_value.replace('+', ' '))
                result.append((name, value))
    return result
441
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    The '+' replacement happens before the % escapes are decoded.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
450
# Byte values that are never percent-encoded: ASCII letters, digits, '_.-'.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Maps a 'safe' bytes value to the lookup callable of its Quoter.
_safe_quoters = {}

class Quoter(collections.defaultdict):
    """A mapping from byte values (ints in range(0, 256)) to strings.

    A byte in the safe set (the always-safe characters plus those given to
    the constructor) maps to its own character; any other byte maps to its
    '%XX' percent-encoded form.
    """
    # Results are cached in the dict itself: __missing__ only runs on the
    # first lookup of a byte value, later lookups never call Python code.
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it and return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
479
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects.  encoding and
    errors may only be given when string is a str; passing either of
    them together with a non-str string raises TypeError.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if not string:
        return string
    encoded = string.encode('utf-8' if encoding is None else encoding,
                            'strict' if errors is None else errors)
    return quote_from_bytes(encoded, safe)
523
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # A str or bytes input without any space needs no '+' handling, so
    # plain quote() already gives the right answer.
    if isinstance(string, str):
        no_space = ' ' not in string
    elif isinstance(string, bytes):
        no_space = b' ' not in string
    else:
        no_space = False
    if no_space:
        return quote(string, safe, encoding, errors)
    # Mark the space as safe (in the same type as 'safe') so it passes
    # through quote() unescaped, then turn it into '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000540
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\\xab') -> 'abc%20def%AB'

    Raises TypeError when bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    # Normalize 'safe' to a bytes object holding only ASCII values.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)
    # Fast path: stripping every safe byte leaves nothing, so no byte
    # needs escaping.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000562
def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values may be bytes or any other object.  bytes are passed to
    quote_plus() with only the safe argument; everything else is converted
    with str() and quoted with safe, encoding and errors.

    Raises TypeError when query is neither a mapping nor a sequence of
    tuples (a non-empty string, for instance).
    """

    def quoted(obj):
        # bytes are quoted as they are; anything else goes through str().
        if isinstance(obj, bytes):
            return quote_plus(obj, safe)
        return quote_plus(str(obj), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # Non-sequence items fail in len(); non-empty strings fail
            # the tuple check.  Zero-length sequences of any type pass.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError as exc:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                exc.__traceback__)

    fields = []
    for k, v in query:
        key = quoted(k)
        if not doseq:
            fields.append(key + '=' + quoted(v))
        elif isinstance(v, bytes):
            fields.append(key + '=' + quote_plus(v, safe))
        elif isinstance(v, str):
            fields.append(key + '=' + quote_plus(v, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                fields.append(key + '=' +
                              quote_plus(str(v), safe, encoding, errors))
            else:
                # one 'key=value' field per element of the sequence
                for elt in v:
                    fields.append(key + '=' + quoted(elt))
    return '&'.join(fields)
640
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') --> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') -->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') --> 'abc def'
# quote('abc def') --> 'abc%20def'
655
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Despite the name the result is still a str: a str argument is only
    checked for being pure ASCII.  Non-str arguments are returned
    unchanged.  Raises UnicodeError for a str with non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
668
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str(); surrounding whitespace, one pair
    of enclosing angle brackets and a leading 'URL:' are removed.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
676
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned lower-cased; it is None when url has no
    'type:' prefix.
    """
    global _typeprog
    if _typeprog is None:
        # Compile lazily, on first use, and keep the pattern at module level.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    scheme = found.group(1)
    remainder = url[len(scheme) + 1:]
    return scheme.lower(), remainder
690
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host is None when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # Compile lazily, on first use, and keep the pattern at module level.
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1, 2)
702
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@' and both parts are passed through
    unquote().  Without an '@' the result is (None, host).  A real tuple
    is returned in both cases, so the result can be indexed as well as
    unpacked.
    """
    global _userprog
    if _userprog is None:
        # Compile lazily, on first use, and keep the pattern at module level.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match:
        # map() is lazy in Python 3; materialise it so that both branches
        # return the same kind of object.
        return tuple(map(unquote, match.group(1, 2)))
    return None, host
714
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; the password is None when there
    is no ':' at all.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compile lazily, on first use.  re.S lets the password part
        # contain newlines.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$', re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
726
727# splittag('/path#tag') --> '/path', 'tag'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  When host does not end
    in ':' followed by digits, the result is (host, None).
    """
    global _portprog
    if _portprog is None:
        # Compile lazily, on first use, and keep the pattern at module level.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1, 2)
739
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        # Compile lazily, on first use, and keep the pattern at module level.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port = found.group(1, 2)
    nport = None
    if port:
        try:
            nport = int(port)
        except ValueError:
            # Not a number: report the port as None.
            pass
    return host, nport
761
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'; the query is None when url
    contains no '?'.
    """
    global _queryprog
    if _queryprog is None:
        # Compile lazily, on first use.  The pattern is a raw string so
        # that '\?' reaches the regex engine as written instead of being
        # an invalid string escape sequence.
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
773
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'; the tag is None when url
    contains no '#'.
    """
    global _tagprog
    if _tagprog is None:
        # Compile lazily, on first use, and keep the pattern at module level.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
785
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'."""
    path, *attrs = url.split(';')
    return path, attrs
791
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='; the value is None when attr
    contains no '='.
    """
    global _valueprog
    if _valueprog is None:
        # Compile lazily, on first use, and keep the pattern at module level.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)