# Source blob: b3494fa23a2fc390033b3a765122c2804cd1c59c
"""Parse (absolute and relative) URLs.

urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994

RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
"""

import sys
import collections

# Public API of this module.  urlencode() is a public function of this
# file, so it is exported together with the other helpers.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs", "parse_qsl",
           "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() memoizes its results in _parse_cache; once the cache holds
# MAX_CACHE_SIZE entries it is emptied wholesale rather than trimmed.
MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Empty both module-level caches: parsed URLs and safe-char quoters."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()

class ResultMixin(object):
    """Shared accessors for the parsed result objects.

    The host class must expose a ``netloc`` string attribute of the form
    ``[user[:password]@]host[:port]``.  Every property below is derived
    from that attribute on each access; nothing is cached.
    """

    @property
    def _userinfo(self):
        """Return ``(username, password)``; each is None when absent."""
        # The last '@' separates user information from the host part.
        userinfo, have_info, _ = self.netloc.rpartition("@")
        if not have_info:
            return None, None
        username, have_password, password = userinfo.partition(":")
        if not have_password:
            password = None
        return username, password

    @property
    def _hostinfo(self):
        """Return ``(hostname, port)`` as raw strings.

        A bracketed IPv6 literal such as ``[::1]:80`` is split on the
        closing bracket so the colons inside the address are not mistaken
        for the port separator.  An absent or empty port yields None.
        """
        hostinfo = self.netloc.rpartition("@")[2]
        _, have_open_br, bracketed = hostinfo.partition("[")
        if have_open_br:
            hostname, _, port = bracketed.partition("]")
            port = port.partition(":")[2]
        else:
            hostname, _, port = hostinfo.partition(":")
        return hostname, (port or None)

    @property
    def username(self):
        """User name from the netloc, or None when there is no '@'."""
        return self._userinfo[0]

    @property
    def password(self):
        """Password from the netloc, or None when none was given."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name (without brackets or port), or None."""
        return self._hostinfo[0].lower() or None

    @property
    def port(self):
        """Port as an int, or None when the netloc carries no port.

        Raises ValueError when a port is present but is not a base-10
        integer.
        """
        port = self._hostinfo[1]
        if port is None:
            return None
        return int(port, 10)

from collections import namedtuple


class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """Five-field tuple produced by urlsplit(), with the netloc accessors
    provided by ResultMixin."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL string via urlunsplit()."""
        return urlunsplit(self)

class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """Six-field tuple produced by urlparse(), with the netloc accessors
    provided by ResultMixin."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL string via urlunparse()."""
        return urlunparse(self)

def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    split_result = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, path, query, fragment = split_result
    params = ''
    # Only schemes listed in uses_params have a ';params' part on the path.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)

def _splitparams(url):
    """Split *url* at the ';' that introduces the params of its last
    path segment and return ``(path, params)``.

    When the path contains a '/', only a ';' at or after the last '/' is
    considered, and ``(url, '')`` is returned if there is none.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semi = url.find(';', last_slash)
        if semi < 0:
            return url, ''
    else:
        semi = url.find(';')
    return url[:semi], url[semi + 1:]

def _splitnetloc(url, start=0):
    """Return ``(netloc, rest)`` where netloc runs from *start* up to the
    first '/', '?' or '#' found at or after *start* (or to the end of
    *url* when none is present) and rest is everything from there on."""
    found = [pos for pos in (url.find(ch, start) for ch in '/?#')
             if pos >= 0]
    end = min(found) if found else len(url)
    return url[start:end], url[end:]

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    *scheme* is the value reported when *url* does not carry a scheme of
    its own.  When *allow_fragments* is false, '#' is left in place and
    the fragment is always ''.  Results are memoized in _parse_cache.
    """
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http':  # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        if all(c in scheme_chars for c in url[:i]):
            rest = url[i+1:]
            # "path:80" is a path followed by a port, not a scheme, but
            # that only holds when *everything* after the colon is a
            # digit; "mailto:1337@example.org" does have a scheme.
            if not rest or any(c not in '0123456789' for c in rest):
                scheme, url = url[:i].lower(), rest
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v

def urlunparse(components):
    """Rebuild a URL string from a six-item (scheme, netloc, path, params,
    query, fragment) iterable.

    The result may differ slightly from, but is equivalent to, the URL
    that was originally parsed if that URL had redundant delimiters,
    e.g. a ? with an empty query."""
    scheme, netloc, path, params, query, fragment = components
    if params:
        # Params ride on the path; urlunsplit() has no slot for them.
        path = "%s;%s" % (path, params)
    return urlunsplit((scheme, netloc, path, query, fragment))

213def urlunsplit(components):
Senthil Kumaran930049b2010-06-28 14:12:18 +0000214 """Combine the elements of a tuple as returned by urlsplit() into a
215 complete URL as a string. The data argument can be any five-item iterable.
216 This may result in a slightly different, but equivalent URL, if the URL that
217 was parsed originally had unnecessary delimiters (for example, a ? with an
218 empty query; the RFC states that these are equivalent)."""
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000219 scheme, netloc, url, query, fragment = components
220 if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
221 if url and url[:1] != '/': url = '/' + url
222 url = '//' + (netloc or '') + url
223 if scheme:
224 url = scheme + ':' + url
225 if query:
226 url = url + '?' + query
227 if fragment:
228 url = url + '#' + fragment
229 return url
230
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty *base* returns *url* unchanged and an empty *url* returns
    *base* unchanged.  *url* is also returned unchanged when its scheme
    differs from the base's or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own host: nothing to inherit.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not (path or params):
        # Reference has only a query and/or fragment: keep the base path.
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, bpath,
                           bparams, query, fragment))
    # Merge: base path minus its last segment, then the relative path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly drop the first interior "name/.." pair until none is left.
    collapsed = True
    while collapsed:
        collapsed = False
        for i in range(1, len(segments) - 1):
            if (segments[i] == '..'
                    and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                collapsed = True
                break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))

def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, frag = urlparse(url)
    return urlunparse((scheme, netloc, path, params, query, '')), frag

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes-like object; a str is encoded as
    UTF-8 first.  A '%' that is not followed by exactly two hexadecimal
    digits is copied through unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    res = string.split(b'%')
    if len(res) == 1:
        return string
    hexdigits = b'0123456789ABCDEFabcdef'
    pieces = [res[0]]
    for item in res[1:]:
        code = item[:2]
        # int(code, 16) alone is too lenient: it would also accept a
        # single digit, a sign or surrounding whitespace.
        if (len(code) == 2 and code[0] in hexdigits
                and code[1] in hexdigits):
            pieces.append(bytes([int(code, 16)]))
            pieces.append(item[2:])
        else:
            pieces.append(b'%')
            pieces.append(item)
    return b''.join(pieces)

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is left
    in the output unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if string == '':
        return string
    res = string.split('%')
    if len(res) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
    pct_sequence = b''
    pieces = [res[0]]
    for item in res[1:]:
        code = item[:2]
        # Validate the digits explicitly: bytes.fromhex() skips
        # whitespace, which would make '%  ' vanish from the result.
        if (len(code) == 2 and code[0] in hexdigits
                and code[1] in hexdigits):
            pct_sequence += bytes([int(code, 16)])
            rest = item[2:]
            if not rest:
                # This segment was just a single percent-encoded character.
                # May be part of a sequence of code units, so delay decoding.
                # (Stored in pct_sequence).
                continue
        else:
            rest = '%' + item
        # Encountered non-percent-encoded characters. Flush the current
        # pct_sequence.
        pieces.append(pct_sequence.decode(encoding, errors))
        pieces.append(rest)
        pct_sequence = b''
    if pct_sequence:
        # Flush the final pct_sequence
        pieces.append(pct_sequence.decode(encoding, errors))
    return ''.join(pieces)

def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        Returns a dict mapping each field name to the list of its values,
        in order of appearance.
    """
    parsed = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        parsed.setdefault(name, []).append(value)
    return parsed

def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) pairs.
    """
    result = []
    # Fields are separated by '&' or ';'.
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not field and not strict_parsing:
                continue
            name, has_equals, value = field.partition('=')
            if not has_equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # Handle case of a control-name with no equal sign
                if not keep_blank_values:
                    continue
            if value or keep_blank_values:
                name = unquote(name.replace('+', ' '))
                value = unquote(value.replace('+', ' '))
                result.append((name, value))
    return result

def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # Pluses are swapped before unquoting so that an escaped %2B survives.
    return unquote(string.replace('+', ' '), encoding, errors)

# Byte values that are never percent-encoded: ASCII letters, digits, "_.-".
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
# The same set as a bytes object, for use with bytes.rstrip().
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache of quoting callables, keyed by the caller's "safe" bytes.
_safe_quoters = {}

class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Results are memoized in the dict itself: __missing__ runs only the
    # first time a given byte value is looked up.

    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        """Compute, store and return the quoted form of byte value *b*."""
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted

def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects. encoding and
    errors must not be specified if string is a bytes object (a
    TypeError is raised).

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if not string:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'strict'
    return quote_from_bytes(string.encode(encoding, errors), safe)

509
510def quote_plus(string, safe='', encoding=None, errors=None):
511 """Like quote(), but also replace ' ' with '+', as required for quoting
512 HTML form values. Plus signs in the original string are escaped unless
513 they are included in safe. It also does not have safe default to '/'.
514 """
Jeremy Hyltonf8198862009-03-26 16:55:08 +0000515 # Check if ' ' in string, where string may either be a str or bytes. If
516 # there are no spaces, the regular quote will produce the right answer.
517 if ((isinstance(string, str) and ' ' not in string) or
518 (isinstance(string, bytes) and b' ' not in string)):
519 return quote(string, safe, encoding, errors)
520 if isinstance(safe, str):
521 space = ' '
522 else:
523 space = b' '
Georg Brandlfaf41492009-05-26 18:31:11 +0000524 string = quote(string, safe + space, encoding, errors)
Jeremy Hyltonf8198862009-03-26 16:55:08 +0000525 return string.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000526
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'

    Raises TypeError when bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)
    # Fast path: nothing is left after stripping safe bytes, so no byte
    # needs escaping.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))

def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values may be either str or bytes.  bytes are passed to
    quote_plus() together with safe only; anything else is converted with
    str() and passed to quote_plus() together with safe, encoding and
    errors.

    Raises TypeError when query is neither a mapping nor a sequence of
    tuples (for example a non-empty string).
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    def _quote(item):
        # Quote one key or value, dispatching on bytes vs. everything else.
        if isinstance(item, bytes):
            return quote_plus(item, safe)
        return quote_plus(str(item), safe, encoding, errors)

    l = []
    for k, v in query:
        k = _quote(k)
        if not doseq:
            values = [_quote(v)]
        elif isinstance(v, bytes):
            values = [quote_plus(v, safe)]
        elif isinstance(v, str):
            values = [quote_plus(v, safe, encoding, errors)]
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                values = [_quote(v)]
            else:
                # one "key=value" parameter per sequence element
                values = [_quote(elt) for elt in v]
        l.extend(k + '=' + value for value in values)
    return '&'.join(l)

# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'

def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Despite the name, a str is returned: a str argument is checked to be
    pure ASCII and handed back as a str.  Any non-str argument is
    returned unchanged.  Raises UnicodeError for a non-ASCII str.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    text = str(url).strip()
    # Drop one pair of enclosing angle brackets, then a leading 'URL:'.
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case; (None, url) is returned when url
    has no leading 'type:' part.
    """
    global _typeprog
    if _typeprog is None:
        # Compiled on first use and kept in the module-level global.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # The match covers the scheme plus its ':'; the rest follows it.
    return found.group(1).lower(), url[found.end():]

_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '?' or '#'.  A non-empty
    remainder that does not start with '/' gets one prepended.  Returns
    (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # Compiled on first use and kept in the module-level global.
        import re
        # '#' ends the authority just like '/' and '?' do, so a fragment
        # must not be captured as part of the host.
        _hostprog = re.compile('^//([^/#?]*)(.*)$')

    match = _hostprog.match(url)
    if match:
        host_port, path = match.groups()
        if path and not path.startswith('/'):
            path = '/' + path
        return host_port, path
    return None, url

_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'; (None, host) is returned when
    there is none.
    """
    global _userprog
    if _userprog is None:
        # Compiled on first use and kept in the module-level global.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    found = _userprog.match(host)
    if found is None:
        return None, host
    return found.groups()

_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; (user, None) is returned when
    there is none.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compiled on first use and kept in the module-level global.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$',re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.groups()

# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits; (host, None) is returned
    when host does not end in ':' followed by digits.
    """
    global _portprog
    if _portprog is None:
        # Compiled on first use and kept in the module-level global.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.groups()

_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        # Compiled on first use and kept in the module-level global.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port = found.groups()
    nport = None
    if port:
        try:
            nport = int(port)
        except ValueError:
            pass
    return host, nport

_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'; (url, None) is returned when
    there is none.
    """
    global _queryprog
    if _queryprog is None:
        # Compiled on first use and kept in the module-level global.
        import re
        # Raw string: '\?' in a plain literal is an invalid escape.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'; (url, None) is returned when
    there is none.
    """
    global _tagprog
    if _tagprog is None:
        # Compiled on first use and kept in the module-level global.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.groups()

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    path, *attrs = url.split(';')
    return path, attrs

_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='; (attr, None) is returned when
    there is none.
    """
    global _valueprog
    if _valueprog is None:
        # Compiled on first use and kept in the module-level global.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.groups()