blob: 765f1c8334e5cbe1fa877e29bd21c1562d47fc1d [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaran6ffdb6f2010-04-17 14:47:13 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
Georg Brandlc62efa82010-07-11 10:41:07 +00008RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaran6ffdb6f2010-04-17 14:47:13 +00009Berners-Lee, R. Fielding, and L. Masinter, August 1998.
10
RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski, July 1998.
Senthil Kumaran6ffdb6f2010-04-17 14:47:13 +000012
13RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
141995.
15
Georg Brandlc62efa82010-07-11 10:41:07 +000016RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaran6ffdb6f2010-04-17 14:47:13 +000017McCahill, December 1994
18
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000025"""
26
Facundo Batista2ac5de22008-07-07 18:24:11 +000027import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000028import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000029
# Names exported by ``from <this module> import *``.  urlencode() is a
# public function defined in this file and therefore belongs in the list.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000034
# A classification of schemes ('' means apply by default).
# Each table is consulted with a plain ``in`` membership test.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', '', 'sftp',
]
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
    'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
]
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips',
]
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    'mms', '', 'sftp',
]
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '',
]
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero', '',
]

# Characters valid in scheme names
scheme_chars = ''.join((
    'abcdefghijklmnopqrstuvwxyz',
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
    '0123456789',
    '+-.',
))

# urlsplit() memoizes its results in _parse_cache and empties the caches
# once the dict holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}
62
def clear_cache():
    """Empty the URL parse cache and the cache of per-safe-set quoters."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000067
68
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Every property is derived on demand from ``self.netloc``, which has the
    general shape ``[user[:password]@]host[:port]``.  The host may be a
    bracketed IPv6 literal such as ``[::1]``.
    """

    @property
    def username(self):
        """User name from the userinfo part, or None without an '@'."""
        netloc = self.netloc
        if "@" in netloc:
            # rsplit: only the last '@' separates userinfo from the host.
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Password from the userinfo part, or None when there is none."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def _hostinfo(self):
        """Return (host, port_text) with userinfo and brackets removed.

        The colons inside a bracketed IPv6 literal are part of the host,
        so the port separator is only looked for after the closing ']'.
        port_text is '' when no port is present.
        """
        hostinfo = self.netloc.rsplit("@", 1)[-1]
        _, have_open_br, bracketed = hostinfo.partition("[")
        if have_open_br:
            host, _, tail = bracketed.partition("]")
            _, _, port = tail.partition(":")
        else:
            host, _, port = hostinfo.partition(":")
        return host, port

    @property
    def hostname(self):
        """Lower-cased host without port or brackets, or None if empty."""
        return self._hostinfo[0].lower() or None

    @property
    def port(self):
        """Port as an int, or None when absent or empty ("host:").

        Raises ValueError if the port text is not a base-10 integer.
        """
        port = self._hostinfo[1]
        if not port:
            return None
        return int(port, 10)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000109
110from collections import namedtuple
111
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """The 5-field result built by urlsplit()."""

    # No per-instance __dict__: the namedtuple fields are the only state.
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL string via urlunsplit()."""
        return urlunsplit(self)
118
119
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """The 6-field result built by urlparse()."""

    # No per-instance __dict__: the namedtuple fields are the only state.
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL string via urlunparse()."""
        return urlunparse(self)
126
127
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a ParseResult, a 6-tuple of
    (scheme, netloc, path, params, query, fragment).  The work is done by
    urlsplit(); this function additionally separates ';params' from the
    path when the scheme is listed in uses_params.  Components are not
    broken up further (netloc stays a single string) and % escapes are
    not expanded.
    """
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
141
def _splitparams(url):
    """Split 'path;params' into (path, params).

    Only a ';' at or after the last '/' counts, so parameters attached to
    earlier path segments stay in the path.  Callers are expected to have
    checked that url contains a ';'.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semi = url.find(';', last_slash)
        if semi < 0:
            # The only ';' characters belong to earlier segments.
            return url, ''
    else:
        semi = url.find(';')
    return url[:semi], url[semi + 1:]
150
def _splitnetloc(url, start=0):
    """Return (netloc, rest) for url, scanning from index start.

    The netloc runs from start up to the earliest '/', '?' or '#' found at
    or after start, or to the end of url if none occurs.
    """
    hits = [pos
            for pos in (url.find(ch, start) for ch in '/?#')
            if pos >= 0]
    end = min(hits) if hits else len(url)
    return url[start:end], url[end:]
158
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Return a SplitResult, a 5-tuple of
    (scheme, netloc, path, query, fragment).  Components are not broken up
    further (netloc stays a single string) and % escapes are not expanded.

    scheme is the default used when url carries no scheme of its own.
    Results are memoized in _parse_cache.
    """
    allow_fragments = bool(allow_fragments)
    cache_key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(cache_key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':  # optimize the common case
            scheme = prefix.lower()
            url = url[colon + 1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            # http always allows a query; a fragment only when requested.
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            result = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[cache_key] = result
            return result
        # "name:123" is read as a path with a port-like suffix, not as a
        # scheme, unless the colon is the very last character.
        if url.endswith(':') or not url[colon + 1].isdigit():
            if all(ch in scheme_chars for ch in prefix):
                scheme, url = prefix.lower(), url[colon + 1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    # For other schemes the delimiters only count when the scheme is
    # registered in the corresponding table.
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
202
def urlunparse(components):
    """Rebuild a URL string from a 6-item iterable of
    (scheme, netloc, path, params, query, fragment).

    Non-empty params are re-attached to the path with ';' and the rest is
    delegated to urlunsplit().  The result may differ slightly from the
    URL originally parsed, but is equivalent, if that URL had redundant
    delimiters, e.g. a ? with an empty query.
    """
    scheme, netloc, path, params, query, fragment = components
    path_with_params = "%s;%s" % (path, params) if params else path
    return urlunsplit((scheme, netloc, path_with_params, query, fragment))
212
213def urlunsplit(components):
Senthil Kumaran930049b2010-06-28 14:12:18 +0000214 """Combine the elements of a tuple as returned by urlsplit() into a
215 complete URL as a string. The data argument can be any five-item iterable.
216 This may result in a slightly different, but equivalent URL, if the URL that
217 was parsed originally had unnecessary delimiters (for example, a ? with an
218 empty query; the RFC states that these are equivalent)."""
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000219 scheme, netloc, url, query, fragment = components
220 if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
221 if url and url[:1] != '/': url = '/' + url
222 url = '//' + (netloc or '') + url
223 if scheme:
224 url = scheme + ':' + url
225 if query:
226 url = url + '?' + query
227 if fragment:
228 url = url + '#' + fragment
229 return url
230
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    Returns url unchanged when base is empty, when url names a different
    scheme than base, or when the scheme is not listed in uses_relative.
    Returns base unchanged when url is empty.  allow_fragments is passed
    through to urlparse() for both arguments.
    """
    # An empty operand contributes nothing: hand back the other one.
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default scheme for the reference.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        # A reference with its own netloc is already absolute.
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    # An absolute path replaces the base path entirely.
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # No path in the reference: start from the base path.
        path = bpath
        if not params:
            params = bparams
        else:
            # The reference supplies its own params: drop the last
            # character of the base path and return without inheriting
            # the base query.
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: replace the last segment of the base path with the
    # segments of the reference, then collapse '.' and '..'.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        # A trailing '.' means "this directory": keep the trailing slash.
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly remove one "<name>/.." pair, rescanning after each
    # removal, until no interior '..' has a removable predecessor.  The
    # first and last segments are never examined as the '..' element.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # Handle a '..' left as the final segment.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
287
def urldefrag(url):
    """Strip the fragment from url.

    Returns a 2-tuple (url_without_fragment, fragment).  When url contains
    no '#', it is returned untouched together with an empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
301
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    string may be a str or a bytes-like object with a split() method.  A
    str is encoded as UTF-8 first; this only matters if it contains
    unescaped non-ASCII characters, which URIs should not.

    Only a '%' followed by exactly two hexadecimal digits is an escape.
    Anything else ('%', '%a', '%+1', '%zz') is copied through unchanged.
    """
    if not string:
        # Is it a string-like object?  The bare attribute access raises
        # AttributeError for falsy non-strings such as None.
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    bits = string.split(b'%')
    if len(bits) == 1:
        return string
    hexdigits = b'0123456789ABCDEFabcdef'
    # Collect the pieces and join once instead of growing a bytes object.
    out = [bits[0]]
    for item in bits[1:]:
        code = item[:2]
        # int(code, 16) on its own would also accept one digit, a sign or
        # surrounding whitespace, so check both characters explicitly.
        if (len(code) == 2
                and code[0] in hexdigits and code[1] in hexdigits):
            out.append(bytes([int(code, 16)]))
            out.append(item[2:])
        else:
            out.append(b'%')
            out.append(item)
    return b''.join(out)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000322
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    The optional encoding and errors parameters specify how to decode
    percent-encoded sequences into Unicode characters, as accepted by the
    bytes.decode() method.  By default, percent-encoded sequences are
    decoded with UTF-8, and invalid sequences are replaced by a
    placeholder character.  Passing None for either selects its default.

    Only a '%' followed by exactly two hexadecimal digits is an escape.
    Anything else is copied through unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if string == '':
        return string
    bits = string.split('%')
    if len(bits) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    out = [bits[0]]
    # pct_sequence: values of a contiguous run of percent-encoded bytes.
    # They are decoded together because one character may span several
    # escapes (e.g. '%E2%82%AC').
    pct_sequence = []
    for item in bits[1:]:
        code = item[:2]
        # bytes.fromhex() skips whitespace, so it cannot be used to
        # validate the escape; check both characters explicitly.
        if (len(code) == 2
                and code[0] in hexdigits and code[1] in hexdigits):
            pct_sequence.append(int(code, 16))
            rest = item[2:]
            if not rest:
                # Nothing but the escape: it may continue a multi-byte
                # sequence, so delay decoding.
                continue
        else:
            rest = '%' + item
        # Encountered non-percent-encoded characters: flush the run.
        if pct_sequence:
            out.append(bytes(pct_sequence).decode(encoding, errors))
            pct_sequence = []
        out.append(rest)
    if pct_sequence:
        # Flush the final run.
        out.append(bytes(pct_sequence).decode(encoding, errors))
    return ''.join(out)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000366
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dict mapping each name to a list of
    its values, in order of appearance.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.
    """
    parsed = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        parsed.setdefault(name, []).append(value)
    return parsed
392
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of (name, value) pairs.

    Fields are separated by '&' or ';'.  In names and values '+' becomes a
    space and % escapes are decoded with unquote().

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Returns a list of 2-tuples in order of appearance.
    """
    fields = []
    for chunk in qs.split('&'):
        fields.extend(chunk.split(';'))

    result = []
    for field in fields:
        # Empty fields are skipped unless strict parsing wants them
        # reported as errors below.
        if not (field or strict_parsing):
            continue
        name, equals, value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # Handle case of a control-name with no equal sign
            if not keep_blank_values:
                continue
        if value or keep_blank_values:
            name = unquote(name.replace('+', ' '))
            value = unquote(value.replace('+', ' '))
            result.append((name, value))

    return result
432
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but first turn every plus sign into a space, as
    required for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
441
# Byte values that are never percent-encoded: ASCII letters, digits, '_.-'.
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    b'abcdefghijklmnopqrstuvwxyz'
    b'0123456789'
    b'_.-'
)
# The same values as a bytes object (in set-iteration order), in the form
# quote_from_bytes() passes to bytes.rstrip().
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache of quoting callables, keyed by the normalized 'safe' bytes.
_safe_quoters = {}
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000448
class Quoter(collections.defaultdict):
    """A mapping from byte values (in range(0, 256)) to strings.

    A byte maps to itself as a one-character string when it is in the safe
    set (the given safe bytes plus the always-safe set); every other byte
    maps to its percent-encoded form.
    """
    # Results are stored in the dict itself, so only the first lookup of a
    # byte reaches __missing__; later lookups of the same key don't call
    # Python code at all.

    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
470
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.

    string and safe may be either str or bytes objects.  encoding and
    errors must not be given unless string is a str; otherwise TypeError
    is raised.

    For a str, encoding and errors say how to turn it into bytes, as
    accepted by str.encode().  They default to 'utf-8' and 'strict'
    (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if not string:
        # Empty str: nothing to quote.
        return string
    data = string.encode('utf-8' if encoding is None else encoding,
                         'strict' if errors is None else errors)
    return quote_from_bytes(data, safe)
514
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values.  Plus signs in the original string are escaped unless
    they are included in safe.  Unlike quote(), safe does not default to
    '/'.
    """
    # Without any space, plain quote() already gives the right answer.
    # string may be a str or a bytes; any other type takes the slow path.
    if isinstance(string, str):
        maybe_space = ' ' in string
    elif isinstance(string, bytes):
        maybe_space = b' ' in string
    else:
        maybe_space = True
    if not maybe_space:
        return quote(string, safe, encoding, errors)

    # Let spaces through quote() untouched by adding one to the safe set
    # (of the same type as safe), then turn them into '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000531
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII
    string.

    bs must be a bytes or bytearray, otherwise TypeError is raised.
    safe may be a str or bytes; non-ASCII characters in it are ignored.

    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    # Normalize 'safe' to bytes holding only ASCII values.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)
    # Fast path: stripping every safe byte leaves nothing, so no byte
    # needs quoting.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    # One quoting callable is kept per distinct safe set.
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000553
def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a mapping or a sequence of two-element tuples into a URL
    query string of the form 'k1=v1&k2=v2'.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values may be bytes or any other object.  bytes are passed to
    quote_plus() with safe only; everything else is converted with str()
    and passed to quote_plus() with safe, encoding and errors.

    Raises TypeError if query is neither a mapping nor a suitable sequence
    (for example a non-empty string).
    """

    def quote_item(item):
        # Quote one key, value or sequence element.
        if isinstance(item, bytes):
            return quote_plus(item, safe)
        return quote_plus(str(item), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # Objects without len() fail here, and so do non-empty strings
            # because their first item is not a tuple.  Zero-length
            # sequences of all types get through.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    pairs = []
    for k, v in query:
        key = quote_item(k)
        if not doseq:
            pairs.append(key + '=' + quote_item(v))
        elif isinstance(v, bytes):
            pairs.append(key + '=' + quote_plus(v, safe))
        elif isinstance(v, str):
            pairs.append(key + '=' + quote_plus(v, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                pairs.append(key + '=' + quote_item(v))
            else:
                # one parameter per element of the sequence
                for elt in v:
                    pairs.append(key + '=' + quote_item(elt))
    return '&'.join(pairs)
631
632# Utilities to parse URLs (most of these return None for missing parts):
633# unwrap('<URL:type://host/path>') --> 'type://host/path'
634# splittype('type:opaquestring') --> 'type', 'opaquestring'
635# splithost('//host[:port]/path') --> 'host[:port]', '/path'
636# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
637# splitpasswd('user:passwd') -> 'user', 'passwd'
638# splitport('host:port') --> 'host', 'port'
639# splitquery('/path?query') --> '/path', 'query'
640# splittag('/path#tag') --> '/path', 'tag'
641# splitattr('/path;attr1=value1;attr2=value2;...') ->
642# '/path', ['attr1=value1', 'attr2=value2', ...]
643# splitvalue('attr=value') --> 'attr', 'value'
644# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
646
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Despite the name, a str argument comes back as a str: it is only
    checked to be pure ASCII.  Any other object is returned unchanged.
    Raises UnicodeError if a str contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        ascii_form = url.encode("ASCII")
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
    return ascii_form.decode()
659
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    url is converted with str().  Surrounding whitespace, one pair of
    enclosing angle brackets and a leading 'URL:' marker are each removed
    if present.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
667
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  When url does not start with a
    run of characters other than '/' and ':' followed by ':', the result
    is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # Compile on first use and keep the pattern in the module global.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if not found:
        return None, url
    scheme = found.group(1)
    rest = url[len(scheme) + 1:]
    return scheme.lower(), rest
681
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '?' or '#', so a query or
    fragment directly after the host is never returned as part of it.
    Returns (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # Compile on first use and keep the pattern in the module global.
        import re
        # '#' is excluded from the host so "//host#frag" yields 'host'.
        # DOTALL lets the remainder contain newlines.
        _hostprog = re.compile('^//([^/#?]*)(.*)$', re.DOTALL)

    match = _hostprog.match(url)
    if match:
        return match.group(1, 2)
    return None, url
693
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Both parts are passed through
    unquote().  Returns (None, host) when host contains no '@'.  The
    result is always a 2-tuple.
    """
    global _userprog
    if _userprog is None:
        # Compile on first use and keep the pattern in the module global.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match:
        # Build a real tuple: a bare map() object is a one-shot iterator
        # that cannot be indexed, compared or unpacked twice.
        return tuple(map(unquote, match.group(1, 2)))
    return None, host
705
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; the password may contain further
    colons and newlines.  Returns (user, None) when there is no ':'.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compile on first use and keep the pattern in the module global.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$',re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
717
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  Returns (host, None)
    when host does not end in ':' followed by one or more digits.
    """
    global _portprog
    if _portprog is None:
        # Compile on first use and keep the pattern in the module global.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    return found.group(1, 2) if found else (host, None)
730
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Return the given default port if no ':' is found; defaults to -1.
    Return the numerical port if a valid number is found after the last
    ':'.  Return None as the port if there is a ':' but the text after it
    is not a non-empty run of ASCII digits.
    """
    global _nportprog
    if _nportprog is None:
        # Compile on first use and keep the pattern in the module global.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        # int() alone would also accept '+80', '-1' or ' 80 ', none of
        # which is a valid port, so require plain digits.
        if port and all(ch in '0123456789' for ch in port):
            nport = int(port)
        else:
            nport = None
        return host, nport
    return host, defport
752
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Returns (url, None) when url
    contains no '?'.
    """
    global _queryprog
    if _queryprog is None:
        # Compile on first use and keep the pattern in the module global.
        import re
        # Raw string: '\?' in a normal string literal is an invalid
        # escape sequence.  The compiled pattern is unchanged.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
764
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Returns (url, None) when url
    contains no '#'.
    """
    global _tagprog
    if _tagprog is None:
        # Compile on first use and keep the pattern in the module global.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
776
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
782
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Returns (attr, None) when attr
    contains no '='.
    """
    global _valueprog
    if _valueprog is None:
        # Compile on first use and keep the pattern in the module global.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    return found.group(1, 2) if found else (attr, None)