blob: 691c004831a456e4222bd684f6142bfe88f412dc [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaranfd41e082010-04-17 14:44:14 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000011RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000012Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000014RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000015
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000019RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000020McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000028"""
29
Facundo Batista2ac5de22008-07-07 18:24:11 +000030import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000031import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000032
Jeremy Hylton1afc1692008-06-18 20:49:58 +000033__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
Facundo Batistac469d4c2008-09-03 22:49:01 +000034 "urlsplit", "urlunsplit", "parse_qs", "parse_qsl",
Guido van Rossum52dbbb92008-08-18 21:44:30 +000035 "quote", "quote_plus", "quote_from_bytes",
36 "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000037
38# A classification of schemes ('' means apply by default)
39uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
40 'wais', 'file', 'https', 'shttp', 'mms',
41 'prospero', 'rtsp', 'rtspu', '', 'sftp']
42uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
43 'imap', 'wais', 'file', 'mms', 'https', 'shttp',
44 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
Florent Xiclunac7b8e862010-05-17 17:33:07 +000045 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
Jeremy Hylton1afc1692008-06-18 20:49:58 +000046non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
47 'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
48uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
49 'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
50 'mms', '', 'sftp']
51uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
52 'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
53uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
54 'nntp', 'wais', 'https', 'shttp', 'snews',
55 'file', 'prospero', '']
56
57# Characters valid in scheme names
58scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
59 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
60 '0123456789'
61 '+-.')
62
63MAX_CACHE_SIZE = 20
64_parse_cache = {}
65
def clear_cache():
    """Clear the parse cache and the quoters cache.

    Both module-level dictionaries are emptied in place, so any other
    reference to them sees the cleared state as well.
    """
    _parse_cache.clear()
    _safe_quoters.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000070
71
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Every property is computed from ``self.netloc``, which the class
    mixing this in must provide.  The layout handled is
    ``[user[:password]@]host[:port]``, with IPv6 hosts written in
    square brackets.
    """

    @property
    def username(self):
        """User name from the netloc, or None when it contains no '@'."""
        netloc = self.netloc
        if "@" in netloc:
            # Everything before the last '@' is userinfo.
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Password from the netloc, or None when none is present."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host name, or None when the host part is empty."""
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            # Bracketed (IPv6) literal: return the text between '[' and ']'.
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Port number as an int, or None when absent or empty.

        Raises ValueError when the port text is not a base-10 integer.
        """
        # Drop userinfo, then anything up to a closing ']' so the colons
        # inside an IPv6 literal are not mistaken for the port separator.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            # 'host:' carries an empty port; report it as missing instead
            # of letting int('') raise ValueError.
            if port:
                return int(port, 10)
        return None
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000114
115from collections import namedtuple
116
class SplitResult(namedtuple('SplitResult',
                             'scheme netloc path query fragment'),
                  ResultMixin):
    """5-tuple (scheme, netloc, path, query, fragment) built by urlsplit().

    The netloc-derived properties come from ResultMixin.
    """

    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunsplit()."""
        return urlunsplit(self)
123
124
class ParseResult(namedtuple('ParseResult',
                             'scheme netloc path params query fragment'),
                  ResultMixin):
    """6-tuple (scheme, netloc, path, params, query, fragment) built by
    urlparse().

    The netloc-derived properties come from ResultMixin.
    """

    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunparse()."""
        return urlunparse(self)
131
132
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    The arguments are passed straight to urlsplit(); the only extra work
    done here is separating ';params' from the path for schemes listed in
    uses_params.
    """
    split_result = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = split_result
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)
146
147def _splitparams(url):
148 if '/' in url:
149 i = url.find(';', url.rfind('/'))
150 if i < 0:
151 return url, ''
152 else:
153 i = url.find(';')
154 return url[:i], url[i+1:]
155
156def _splitnetloc(url, start=0):
157 delim = len(url) # position of end of domain part of url, default is end
158 for c in '/?#': # look for delimiters; the order is NOT important
159 wdelim = url.find(c, start) # find first of this delim
160 if wdelim >= 0: # if found
161 delim = min(delim, wdelim) # use earliest delim position
162 return url[start:delim], url[delim:] # return (domain, rest)
163
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    *scheme* is the default used when the URL does not carry one.  When
    *allow_fragments* is false, '#' is left in place instead of starting
    a fragment.  Results are memoized in _parse_cache.

    Raises ValueError("Invalid IPv6 URL") when the netloc contains a '['
    without a ']' or the other way round.
    """
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key so that equal-comparing
    # values of different types do not share a cache entry.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http':  # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        # The text before ':' is a scheme only if every character is a
        # legal scheme character; otherwise the default scheme is kept
        # and the URL is left untouched.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    # Outside the 'http' fast path, fragments and queries are split off
    # only for schemes registered as using them.
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
212
def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    *components* is any six-item iterable in the order
    (scheme, netloc, path, params, query, fragment).
    """
    scheme, netloc, url, params, query, fragment = components
    if params:
        # Re-attach the parameters to the path before delegating.
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))
222
def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string.  The components argument can be any
    five-item iterable in the order (scheme, netloc, path, query,
    fragment).
    This may result in a slightly different, but equivalent URL, if the URL
    that was parsed originally had unnecessary delimiters (for example, a ?
    with an empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment = components
    # Emit '//' when there is a netloc, or when the scheme is one that
    # takes a netloc and the path does not already begin with '//'.
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
240
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned as is.  *url*
    is also returned unchanged when its scheme differs from the base's or
    is not listed in uses_relative.  *allow_fragments* is passed on to
    urlparse() for both URLs.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own host, so nothing but the scheme
            # is taken from the base.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: replaces the base path entirely.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: reuse the base path, and fall back to the base
        # params/query when the reference has none of its own.
        path = bpath
        if not params:
            params = bparams
        else:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: drop the last base segment and append the reference.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse the first "<name>/.." pair found until a full
    # scan finds none.
    while True:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                    and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
297
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' not in url:
        # Nothing to strip; skip the parse/unparse round trip so the URL
        # comes back exactly as given.
        return url, ''
    s, n, p, a, q, frag = urlparse(url)
    defrag = urlunparse((s, n, p, a, q, ''))
    return defrag, frag
311
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes-like object.  Each '%' followed by
    exactly two hexadecimal digits is replaced by the corresponding byte;
    any other '%' is left in place together with the text after it.

    Raises TypeError when *string* is None.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        if string is None:
            raise TypeError('None object is invalid for unquote_to_bytes()')
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    res = string.split(b'%')
    if len(res) == 1:
        return string
    hexdigits = b'0123456789ABCDEFabcdef'
    parts = [res[0]]
    for item in res[1:]:
        pair = item[:2]
        # Insist on two real hex digits: int(x, 16) alone would also
        # accept a single digit ('%1') or a sign ('%+1').
        if len(pair) == 2 and pair[0] in hexdigits and pair[1] in hexdigits:
            parts.append(bytes([int(pair, 16)]))
            parts.append(item[2:])
        else:
            parts.append(b'%')
            parts.append(item)
    # Joining on an empty slice of the input keeps the result type equal
    # to the input type (bytes or bytearray) and avoids repeated copying.
    return res[0][:0].join(parts)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000332
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is kept
    literally.  Passing None for encoding or errors selects the default.
    Raises TypeError when string is None.

    unquote('abc%20def') -> 'abc def'.
    """
    if not string:
        if string is None:
            raise TypeError('None object is invalid for unquote() function.')
        return string
    res = string.split('%')
    if len(res) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    # pct_sequence: contiguous run of percent-encoded bytes not yet decoded
    pct_sequence = bytearray()
    parts = [res[0]]
    for item in res[1:]:
        pair = item[:2]
        # Check the digits explicitly; bytes.fromhex() skips whitespace
        # and would turn '%  ' into nothing.
        if len(pair) == 2 and pair[0] in hexdigits and pair[1] in hexdigits:
            pct_sequence.append(int(pair, 16))
            rest = item[2:]
            if not rest:
                # This segment was just a single percent-encoded character.
                # May be part of a sequence of code units, so delay decoding.
                # (Stored in pct_sequence).
                continue
        else:
            rest = '%' + item
        # Encountered non-percent-encoded characters. Flush the current
        # pct_sequence.
        parts.append(pct_sequence.decode(encoding, errors))
        parts.append(rest)
        pct_sequence = bytearray()
    if pct_sequence:
        # Flush the final pct_sequence
        parts.append(pct_sequence.decode(encoding, errors))
    return ''.join(parts)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000378
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dictionary mapping each name to the list of its values, in
    the order parse_qsl() produced them.
    """
    parsed = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        parsed.setdefault(name, []).append(value)
    return parsed
404
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) pairs in the order they appear.
    """
    # Fields may be separated by either '&' or ';'.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if nv[1] or keep_blank_values:
            # '+' stands for a space in form-encoded data.
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
444
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    The plus signs are replaced before percent-decoding, so an escaped
    '%2B' still comes out as a literal '+'.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    string = string.replace('+', ' ')
    return unquote(string, encoding, errors)
453
454_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
455 b'abcdefghijklmnopqrstuvwxyz'
456 b'0123456789'
457 b'_.-')
Florent Xiclunac7b8e862010-05-17 17:33:07 +0000458_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
459_safe_quoters = {}
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000460
class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
        # Always-safe characters stay unescaped in addition to the
        # caller-supplied ones.
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Handle a cache miss. Store quoted string in cache and return.
        res = chr(b) if b in self.safe else '%{:02X}'.format(b)
        self[b] = res
        return res
482
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects.  encoding and
    errors must not be specified if string is a bytes object; doing so
    raises TypeError.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if isinstance(string, str):
        if not string:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        string = string.encode(encoding, errors)
    else:
        # Non-str input is handed on undecoded, so text-encoding options
        # make no sense for it.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    return quote_from_bytes(string, safe)
526
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if ((isinstance(string, str) and ' ' not in string) or
            (isinstance(string, bytes) and b' ' not in string)):
        return quote(string, safe, encoding, errors)
    # Mark the space as safe so quote() leaves it alone; it has to be
    # appended in the same type (str or bytes) as 'safe' itself.
    if isinstance(safe, str):
        space = ' '
    else:
        space = b' '
    string = quote(string, safe + space, encoding, errors)
    return string.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000543
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'

    Raises TypeError when bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes([c for c in safe if c < 128])
    # Fast path: if stripping every safe byte leaves nothing, no byte
    # needs escaping.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    # One Quoter per distinct 'safe' value is cached and reused.
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
    return ''.join([quoter(char) for char in bs])
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000565
def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values may be str or bytes; anything else is converted with
    str() first.  bytes are passed to quote_plus() with safe only, while
    text is passed with safe, encoding and errors.

    Raises TypeError when query is neither a mapping nor a sequence of
    tuples (a non-empty string, for instance).
    """

    def _quote_component(value):
        # bytes must not be given encoding/errors; everything else is
        # quoted as text.
        if isinstance(value, bytes):
            return quote_plus(value, safe)
        return quote_plus(str(value), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            # Re-raise with a clearer message, keeping the original
            # traceback.
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    pairs = []
    for k, v in query:
        k = _quote_component(k)
        if doseq and not isinstance(v, (bytes, str)):
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence: fall through and quote it as one value
                pass
            else:
                # loop over the sequence
                for elt in v:
                    pairs.append(k + '=' + _quote_component(elt))
                continue
        pairs.append(k + '=' + _quote_component(v))
    return '&'.join(pairs)
643
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
658
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Despite the name, a str argument is returned as a str: it is only
    checked to be pure ASCII.  Any non-str argument is returned unchanged.

    Raises UnicodeError when a str contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if isinstance(url, str):
        try:
            url = url.encode("ASCII").decode()
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url
671
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; then an enclosing '<...>' pair and a leading 'URL:'
    prefix are removed, each only if present.
    """
    url = str(url).strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:':
        url = url[4:].strip()
    return url
679
# Compiled on first use by splittype().
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned lower-cased.  When the URL does not start with
    a run of characters other than '/' and ':' followed by ':', the
    result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile(r'^([^/:]+):')

    match = _typeprog.match(url)
    if match:
        scheme = match.group(1)
        # Skip the scheme and the ':' that follows it.
        return scheme.lower(), url[len(scheme) + 1:]
    return None, url
693
# Compiled on first use by splithost().
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part runs from the leading '//' up to the first '/' or '?'.
    Returns (None, url) when the URL does not match, e.g. when it does
    not begin with '//'.
    """
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile(r'^//([^/?]*)(.*)$')

    match = _hostprog.match(url)
    if match:
        return match.group(1, 2)
    return None, url
705
# Compiled on first use by splituser().
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Both parts are passed through
    unquote() before being returned.  Returns (None, host) when there is
    no '@'.
    """
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile(r'^(.*)@(.*)$')

    match = _userprog.match(host)
    if match:
        # Materialize as a tuple so both return paths yield the same type;
        # a bare map() would hand back a one-shot iterator.
        return tuple(map(unquote, match.group(1, 2)))
    return None, host
717
# Compiled on first use by splitpasswd().
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; the password may itself contain
    ':' and newlines.  Returns (user, None) when there is no ':'.
    """
    global _passwdprog
    if _passwdprog is None:
        import re
        # re.S lets '.' match newlines inside the password.
        _passwdprog = re.compile(r'^([^:]*):(.*)$', re.S)

    match = _passwdprog.match(user)
    if match:
        return match.group(1, 2)
    return user, None
729
# splitport('host:port') --> 'host', 'port'
# Compiled on first use by splitport().
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string and is recognized only when it is a
    non-empty run of ASCII digits after the last ':'.  Otherwise the
    result is (host, None) with host unchanged.
    """
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile(r'^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match:
        return match.group(1, 2)
    return host, None
742
# Compiled on first use by splitnport().
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile(r'^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            # An empty port ('host:') counts as invalid, like any other
            # text int() rejects.
            if not port:
                raise ValueError("no digits")
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport
764
# Compiled on first use by splitquery().
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Returns (url, None) when the URL
    contains no '?'.
    """
    global _queryprog
    if _queryprog is None:
        import re
        # Raw string: '\?' is not a valid escape in an ordinary literal.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
776
# Compiled on first use by splittag().
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Returns (url, None) when the URL
    contains no '#'.
    """
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile(r'^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
788
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when the URL contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
794
# Compiled on first use by splitvalue().
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Returns (attr, None) when there
    is no '='.
    """
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile(r'^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match:
        return match.group(1, 2)
    return attr, None