blob: 1f54ac6f99a91db3530f5e96d186661a23d329fe [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaran6ffdb6f2010-04-17 14:47:13 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
Georg Brandlc62efa82010-07-11 10:41:07 +00008RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaran6ffdb6f2010-04-17 14:47:13 +00009Berners-Lee, R. Fielding, and L. Masinter, August 1998.
10
Georg Brandlc62efa82010-07-11 10:41:07 +000011RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
Senthil Kumaran6ffdb6f2010-04-17 14:47:13 +000012
13RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
141995.
15
Georg Brandlc62efa82010-07-11 10:41:07 +000016RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaran6ffdb6f2010-04-17 14:47:13 +000017McCahill, December 1994
18
RFC 3986 is considered the current standard and any future changes to the
urlparse module should conform with it. The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000025"""
26
Facundo Batista2ac5de22008-07-07 18:24:11 +000027import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000028import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000029
# Names exported by ``from <module> import *``.  urlencode() is a public
# function of this module, so it is listed alongside the other
# quoting/parsing helpers.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs", "parse_qsl",
           "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000034
# A classification of schemes ('' means apply by default).
# These are plain lists that the functions of this module consult with
# ``scheme in <list>``.

# Schemes for which urljoin() resolves relative references.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']

# Schemes that take a "//netloc" part (consulted by urlunsplit() and
# urljoin()).  NOTE: the entry is 'git', without a leading space -- a
# padded ' git' can never equal a parsed scheme name.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

# Schemes whose path may carry ";params" (consulted by urlparse()).
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']

# Schemes for which urlsplit() separates a "?query" part.
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']

# Schemes for which urlsplit() separates a "#fragment" part.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() memoises its results in _parse_cache and empties the cache
# once it holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}
62
def clear_cache():
    """Discard every entry held in the module-level parse cache."""
    _parse_cache.clear()
66
67
class ResultMixin(object):
    """Shared accessors for the parsed result objects.

    The class this is mixed into must expose a string attribute ``netloc``
    of the general form ``[user[:password]@]host[:port]``.  The host may be
    a bracketed literal such as ``[::1]``, in which case the colons inside
    the brackets are not mistaken for the port separator.
    """

    def _hostinfo(self):
        """Return ``(host, port_text)`` taken from ``netloc``.

        ``port_text`` is None when there is no ':' port separator and may
        be the empty string when the separator is present without digits.
        """
        # Everything after the last '@' is the host[:port] part.
        hostinfo = self.netloc.rpartition("@")[2]
        if hostinfo.startswith("["):
            closing = hostinfo.find("]")
            if closing >= 0:
                # Bracketed literal: only a ':' after ']' introduces a port.
                host = hostinfo[1:closing]
                _, has_port, port = hostinfo[closing + 1:].partition(":")
                return host, (port if has_port else None)
        host, has_port, port = hostinfo.partition(":")
        return host, (port if has_port else None)

    @property
    def username(self):
        """The user name, or None when netloc has no '@'."""
        userinfo, has_userinfo, _ = self.netloc.rpartition("@")
        if not has_userinfo:
            return None
        return userinfo.partition(":")[0]

    @property
    def password(self):
        """The password, or None when no ``user:password@`` is present."""
        userinfo, has_userinfo, _ = self.netloc.rpartition("@")
        _, has_password, password = userinfo.partition(":")
        if has_userinfo and has_password:
            return password
        return None

    @property
    def hostname(self):
        """The lower-cased host name (brackets removed), or None if empty."""
        return self._hostinfo()[0].lower() or None

    @property
    def port(self):
        """The port as an int, or None when it is missing or empty.

        Raises ValueError when the port text is not a base-10 integer.
        """
        port = self._hostinfo()[1]
        if not port:
            return None
        return int(port, 10)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000108
109from collections import namedtuple
110
class SplitResult(namedtuple('SplitResult',
                             ['scheme', 'netloc', 'path', 'query',
                              'fragment']),
                  ResultMixin):
    """Five-field URL tuple, as built by urlsplit(), with the netloc
    accessors of ResultMixin mixed in."""

    # No per-instance __dict__: the object stays a plain tuple.
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL string with urlunsplit()."""
        return urlunsplit(self)
117
118
class ParseResult(namedtuple('ParseResult',
                             ['scheme', 'netloc', 'path', 'params', 'query',
                              'fragment']),
                  ResultMixin):
    """Six-field URL tuple, as built by urlparse(), with the netloc
    accessors of ResultMixin mixed in."""

    # No per-instance __dict__: the object stays a plain tuple.
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL string with urlunparse()."""
        return urlunparse(self)
125
126
def urlparse(url, scheme='', allow_fragments=True):
    """Split a URL into its six components.

    The general shape is
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    and the result is a ParseResult
    (scheme, netloc, path, params, query, fragment).

    The heavy lifting is delegated to urlsplit(); this function only
    separates ";params" from the path, and only for schemes listed in
    uses_params.  Components are not broken up any further (netloc stays
    one string) and % escapes are left untouched.
    """
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    if ';' in path and scheme in uses_params:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
140
141def _splitparams(url):
142 if '/' in url:
143 i = url.find(';', url.rfind('/'))
144 if i < 0:
145 return url, ''
146 else:
147 i = url.find(';')
148 return url[:i], url[i+1:]
149
150def _splitnetloc(url, start=0):
151 delim = len(url) # position of end of domain part of url, default is end
152 for c in '/?#': # look for delimiters; the order is NOT important
153 wdelim = url.find(c, start) # find first of this delim
154 if wdelim >= 0: # if found
155 delim = min(delim, wdelim) # use earliest delim position
156 return url[start:delim], url[delim:] # return (domain, rest)
157
def urlsplit(url, scheme='', allow_fragments=True):
    """Split a URL into its five components.

    The general shape is
    <scheme>://<netloc>/<path>?<query>#<fragment>
    and the result is a SplitResult
    (scheme, netloc, path, query, fragment).

    *scheme* is the value reported when the URL itself carries none.
    For schemes other than 'http' the query is only separated when the
    scheme is in uses_query, and the fragment only when it is in
    uses_fragment (and *allow_fragments* is true).  Components are not
    broken up any further and % escapes are left untouched.

    Results are memoised in the module-level parse cache.
    """
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # Avoid runaway growth: start over with an empty cache.
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':
            # Fast path for the common case: no scheme table lookups.
            scheme = prefix.lower()
            url = url[colon+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, _, fragment = url.partition('#')
            if '?' in url:
                url, _, query = url.partition('?')
            result = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = result
            return result
        # The text before ':' is a scheme only if every character of it
        # is a legal scheme character.
        if all(ch in scheme_chars for ch in prefix):
            scheme, url = prefix.lower(), url[colon+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, _, fragment = url.partition('#')
    if scheme in uses_query and '?' in url:
        url, _, query = url.partition('?')
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return result
200
def urlunparse(components):
    """Rebuild a URL string from a six-item
    (scheme, netloc, path, params, query, fragment) iterable.

    Non-empty params are re-attached to the path with ';' and the rest
    is handed to urlunsplit().  The result may differ slightly from the
    URL that was parsed, but is equivalent to it, when the original had
    redundant delimiters (e.g. a '?' with an empty query).
    """
    scheme, netloc, path, params, query, fragment = components
    if params:
        path = "%s;%s" % (path, params)
    return urlunsplit((scheme, netloc, path, query, fragment))
210
def urlunsplit(components):
    """Rebuild a URL string from a five-item
    (scheme, netloc, path, query, fragment) iterable, such as the tuple
    returned by urlsplit().

    The result may differ slightly from the URL that was parsed, but is
    equivalent to it, when the original had unnecessary delimiters (for
    example a '?' with an empty query).
    """
    scheme, netloc, path, query, fragment = components
    # A "//" authority marker is written when there is a netloc, or when
    # the scheme is one that uses a netloc and the path does not already
    # begin with "//".
    wants_authority = netloc or (scheme and scheme in uses_netloc
                                 and path[:2] != '//')
    if wants_authority:
        if path and path[:1] != '/':
            path = '/' + path
        path = '//' + (netloc or '') + path
    result = path
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return result
228
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    Returns *url* unchanged when *base* is empty, and *base* unchanged
    when *url* is empty.  *allow_fragments* is passed on to urlparse()
    for both arguments.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The reference is parsed with the base's scheme as its default.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that does not support relative
    # references, means the reference is used as-is.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own netloc: nothing is inherited.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Path starting with '/': keep it, only the netloc is inherited.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: take the base path, then inherit params/query
        # where the reference has none of its own.
        path = bpath
        if not params:
            params = bparams
        else:
            # The reference has its own params: the last character of
            # the (base) path is dropped and the result returned at once.
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                                params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: replace the last segment of the base path with the
    # segments of the reference, then normalise '.' and '..'.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        # A trailing '.' becomes an empty segment (keeps a trailing '/').
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly remove one "<name>/.." pair (not the final segment) and
    # rescan from the start until no such pair is left.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # A '..' in final position is handled separately.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
285
def urldefrag(url):
    """Strip the fragment, if any, from *url*.

    Returns a ``(url_without_fragment, fragment)`` tuple.  A URL that
    contains no '#' is returned untouched with an empty string as the
    second element.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
299
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes-like object.  A str is first encoded
    as UTF-8; this only matters if it contains unescaped non-ASCII
    characters, which URIs should not.

    Only a '%' followed by exactly two hexadecimal digits is decoded.
    Any other '%' (one digit, a sign, whitespace, end of input, ...) is
    copied to the output unchanged.
    """
    if isinstance(string, str):
        string = string.encode('utf-8')
    hex_digits = b'0123456789ABCDEFabcdef'
    pieces = string.split(b'%')
    out = [pieces[0]]
    for item in pieces[1:]:
        code = item[:2]
        # int(code, 16) on its own is too lenient: it would also accept
        # b'a', b'+1' or b' 1', so the two digits are checked explicitly.
        if (len(code) == 2 and code[0] in hex_digits
                and code[1] in hex_digits):
            out.append(bytes([int(code, 16)]) + item[2:])
        else:
            out.append(b'%' + item)
    return b''.join(out)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000315
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    The optional *encoding* and *errors* parameters specify how to decode
    percent-encoded sequences into Unicode characters, as accepted by the
    bytes.decode() method.  By default (also when None is passed),
    percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    Only a '%' followed by exactly two hexadecimal digits is an escape;
    any other '%' is copied to the result unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hex_digits = '0123456789ABCDEFabcdef'
    pieces = string.split('%')
    out = [pieces[0]]
    # Bytes of a run of adjacent escapes.  They are decoded together
    # because one character may be spread over several %xx code units.
    pending = []
    for item in pieces[1:]:
        code = item[:2]
        # Checked explicitly: bytes.fromhex() ignores whitespace, so
        # fromhex('  ') would "succeed" and silently swallow the '%'.
        if (len(code) == 2 and code[0] in hex_digits
                and code[1] in hex_digits):
            pending.append(bytes.fromhex(code))
            rest = item[2:]
        else:
            rest = '%' + item
        if rest:
            # Literal text ends the current run of escapes: flush it.
            out.append(b''.join(pending).decode(encoding, errors))
            out.append(rest)
            pending = []
    if pending:
        # The string ended with an escape run that is still undecoded.
        out.append(b''.join(pending).decode(encoding, errors))
    return ''.join(out)
356
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dictionary.

    Arguments:

    qs: URL-encoded query string to be parsed.

    keep_blank_values: if true, fields with a blank value are kept and
        reported with an empty string; if false (the default) they are
        dropped as if they had not been included.

    strict_parsing: if false (the default), parsing errors are silently
        ignored; if true, they raise ValueError.

    Returns a dict mapping each field name to the list of its values, in
    the order parse_qsl() produced them.
    """
    result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        result.setdefault(name, []).append(value)
    return result
382
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed.  Fields may be separated
        by '&' or ';'.

    keep_blank_values: if true, fields with a blank value are kept and
        reported with an empty string; if false (the default) they are
        dropped as if they had not been included.

    strict_parsing: if false (the default), parsing errors are silently
        ignored; if true, they raise ValueError.

    In names and values '+' is turned into a space before the %xx
    escapes are decoded with unquote().
    """
    fields = []
    for chunk in qs.split('&'):
        fields.extend(chunk.split(';'))

    result = []
    for field in fields:
        if not (field or strict_parsing):
            # Empty field (e.g. from "a=1&&b=2"): skipped unless strict.
            continue
        parts = field.split('=', 1)
        if len(parts) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control name without an equal sign counts as a blank value.
            if not keep_blank_values:
                continue
            parts.append('')
        name, value = parts
        if value or keep_blank_values:
            result.append((unquote(name.replace('+', ' ')),
                           unquote(value.replace('+', ' '))))
    return result
422
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Same as unquote(), except that every plus sign is first turned
    into a space, as needed when unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
431
# Byte values that are never percent-encoded.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
# Quoter instances that have already been built, keyed by "safe" bytes.
_safe_quoters = {}


class Quoter(collections.defaultdict):
    """Lazily filled mapping from a byte value (0..255) to its quoted
    text.

    A byte maps to itself (as a one-character string) when it is below
    128 and belongs to the safe set -- the always-safe characters plus
    those passed to the constructor.  Every other byte maps to its
    '%XX' percent-encoding.
    """
    # Being a defaultdict, each answer is computed once by __missing__
    # and afterwards served straight from the dict without running
    # Python code.

    def __init__(self, safe):
        """safe: bytes object listing additional characters to leave
        unquoted; values of 128 and above are ignored."""
        extra = frozenset(byte for byte in safe if byte < 128)
        self.safe = _ALWAYS_SAFE | extra

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: work out the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%%%02X' % b
        self[b] = quoted
        return quoted
459
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default this function is meant for quoting the path section of a
    URL, so '/' is left alone: in typical usage the slashes already in
    the path act as delimiters.

    string and safe may be either str or bytes objects.

    For a str, the optional encoding and errors parameters are handed to
    str.encode(); they default to 'utf-8' and 'strict' (unsupported
    characters raise a UnicodeEncodeError).  For anything that is not a
    str they must be left as None, otherwise TypeError is raised.
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'strict'
    return quote_from_bytes(string.encode(encoding, errors), safe)
501
def quote_plus(string, safe='', encoding=None, errors=None):
    """Variant of quote() that writes ' ' as '+', as required for
    quoting HTML form values.

    Plus signs already in the input are escaped unless they are listed
    in safe.  Unlike quote(), safe does not default to '/'.
    """
    # string may be a str or a bytes object.  When it holds no space at
    # all, plain quote() already gives the right answer; any other type
    # always goes through the general path below.
    if isinstance(string, str):
        may_have_space = ' ' in string
    elif isinstance(string, bytes):
        may_have_space = b' ' in string
    else:
        may_have_space = True
    if not may_have_space:
        return quote(string, safe, encoding, errors)

    # Treat the space as safe while quoting, then swap it for '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000518
def quote_from_bytes(bs, safe='/'):
    """Quote a bytes (or bytearray) object; the counterpart of quote()
    for input that needs no string-to-bytes encoding.  The result is
    always an ASCII str.

    Raises TypeError when bs is neither bytes nor bytearray.

    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
    """
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    cachekey = bytes(safe)  # In case it was a bytearray
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected a bytes")
    # One Quoter per distinct safe set, built on first use.
    quoter = _safe_quoters.get(cachekey)
    if quoter is None:
        quoter = _safe_quoters[cachekey] = Quoter(safe)
    return ''.join(map(quoter.__getitem__, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000536
def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a mapping or a sequence of two-element tuples into a URL
    query string of '&'-joined 'key=value' fields.

    If doseq is true, a value that is a sequence (anything with a len()
    other than str and bytes) yields one 'key=element' field per
    element.  Otherwise every value is a single field.

    For a sequence of two-element tuples the fields appear in the order
    of the input.

    Keys and values are quoted with quote_plus().  bytes objects are
    quoted as they are, with only safe passed on; everything else is
    converted with str() and quoted using safe, encoding and errors.

    Raises TypeError when query is neither a mapping nor a sequence
    whose first item is a tuple.
    """

    def quote_item(item):
        # bytes are quoted verbatim, anything else via its str() form.
        if isinstance(item, bytes):
            return quote_plus(item, safe)
        return quote_plus(str(item), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # len() fails for non-sequences; a non-empty string fails the
            # tuple test.  Zero-length sequences of any type pass, which
            # matches the long-standing acceptance of empty dicts.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    expand_sequences = bool(doseq)
    fields = []
    for k, v in query:
        key = quote_item(k)
        if not expand_sequences:
            fields.append(key + '=' + quote_item(v))
            continue
        if isinstance(v, bytes):
            fields.append(key + '=' + quote_plus(v, safe))
        elif isinstance(v, str):
            fields.append(key + '=' + quote_plus(v, safe, encoding, errors))
        else:
            try:
                # Having a length is taken as the sign of a sequence.
                len(v)
            except TypeError:
                # Not a sequence: encode it as one scalar value.
                scalar = quote_plus(str(v), safe, encoding, errors)
                fields.append(key + '=' + scalar)
            else:
                for elt in v:
                    fields.append(key + '=' + quote_item(elt))
    return '&'.join(fields)
614
615# Utilities to parse URLs (most of these return None for missing parts):
616# unwrap('<URL:type://host/path>') --> 'type://host/path'
617# splittype('type:opaquestring') --> 'type', 'opaquestring'
618# splithost('//host[:port]/path') --> 'host[:port]', '/path'
619# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
620# splitpasswd('user:passwd') -> 'user', 'passwd'
621# splitport('host:port') --> 'host', 'port'
622# splitquery('/path?query') --> '/path', 'query'
623# splittag('/path#tag') --> '/path', 'tag'
624# splitattr('/path;attr1=value1;attr2=value2;...') ->
625# '/path', ['attr1=value1', 'attr2=value2', ...]
626# splitvalue('attr=value') --> 'attr', 'value'
627# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
629
def to_bytes(url):
    """to_bytes("URL") --> 'URL'.

    A str is checked to be pure ASCII and returned as a str; a
    UnicodeError is raised if it contains non-ASCII characters.  Any
    other object is returned untouched.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
642
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; enclosing angle brackets and a leading 'URL:' marker are
    then removed, stripping whitespace again after each step.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
650
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  Without a leading 'type:'
    prefix (one or more characters other than '/' and ':', followed by
    ':') the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # Compiled on first use and kept in the module-level variable.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # The match covers "<scheme>:", so its end is where the rest starts.
    return found.group(1).lower(), url[found.end():]
664
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part starts after the leading '//' and ends at the first
    '/', '?' or '#' (or at the end of the string); whatever follows is
    returned as the second element.  '#' ends the host as well, so that
    a fragment can never be taken for part of the host -- the same set
    of delimiters that _splitnetloc() uses.

    Returns (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # Compiled on first use and kept in the module-level variable.
        import re
        _hostprog = re.compile('^//([^/?#]*)(.*)$')

    match = _hostprog.match(url)
    if match:
        return match.group(1, 2)
    return None, url
676
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split is made at the last '@'.  Both parts are passed through
    unquote() and returned as a 2-tuple.  Without an '@' the result is
    (None, host), with host left untouched.
    """
    global _userprog
    if _userprog is None:
        # Compiled on first use and kept in the module-level variable.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match:
        user, hostport = match.group(1, 2)
        # Return a real tuple, like the no-match branch does: a lazy
        # map() object can be unpacked once but neither indexed nor
        # compared.
        return unquote(user), unquote(hostport)
    return None, host
688
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split is made at the first ':'; the password may contain further
    colons and newlines.  Without a ':' the result is (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        # Compiled on first use and kept in the module-level variable.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$',re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
700
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is the run of ASCII digits after the last ':' and is
    returned as a string.  A trailing ':' with no digits ('host:') is
    an empty port: the colon is removed and the port is None.  When the
    text after the last ':' is not purely digits, or there is no ':' at
    all, the result is (host, None) with host unchanged.
    """
    global _portprog
    if _portprog is None:
        # Compiled on first use and kept in the module-level variable.
        import re
        _portprog = re.compile('^(.*):([0-9]*)$')

    match = _portprog.match(host)
    if match:
        hostname, port = match.group(1, 2)
        if port:
            return hostname, port
        # Empty port: drop the dangling ':' from the host.
        return hostname, None
    return host, None
713
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning a numeric port.

    The split is made at the last ':'.
    Return (host, defport) if no ':' is found; defport defaults to -1.
    Return (host, port-as-int) if the text after ':' consists only of
    ASCII digits.
    Return (host, None) if there is a ':' but the text after it is empty
    or not a plain number (signs and whitespace are not accepted, so a
    literal '-1' cannot be confused with the default).
    """
    global _nportprog
    if _nportprog is None:
        # Compiled on first use and kept in the module-level variable.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if not match:
        return host, defport
    host, port = match.group(1, 2)
    # int() alone is too lenient: it also accepts '+80', '-1' and ' 80 '.
    if port and all(ch in '0123456789' for ch in port):
        return host, int(port)
    return host, None
735
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split is made at the last '?'.  Without a '?' the result is
    (url, None).
    """
    global _queryprog
    if _queryprog is None:
        # Compiled on first use and kept in the module-level variable.
        import re
        # Raw string: in a normal string literal '\?' is an invalid
        # escape sequence.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
747
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split is made at the last '#'.  Without a '#' the result is
    (url, None).
    """
    global _tagprog
    if _tagprog is None:
        # Compiled on first use and kept in the module-level variable.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
759
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
765
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split is made at the first '='.  Without an '=' the result is
    (attr, None).
    """
    global _valueprog
    if _valueprog is None:
        # Compiled on first use and kept in the module-level variable.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)