blob: e1afe528d1f5f5394a9e5a425d4bf2e22ffb6552 [file] [log] [blame]
"""Parse (absolute and relative) URLs.

urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
"""
26
Facundo Batista2ac5de22008-07-07 18:24:11 +000027import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000028import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000029
Jeremy Hylton1afc1692008-06-18 20:49:58 +000030__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
Facundo Batistac469d4c2008-09-03 22:49:01 +000031 "urlsplit", "urlunsplit", "parse_qs", "parse_qsl",
Guido van Rossum52dbbb92008-08-18 21:44:30 +000032 "quote", "quote_plus", "quote_from_bytes",
33 "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000034
# A classification of schemes ('' means apply by default).
# These are plain lists so that callers can register additional schemes
# at runtime by appending to them.

# Schemes for which relative references are resolved by urljoin().
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
# Schemes that carry a network location ("//netloc") component.
# NOTE: 'git' used to be spelled ' git' (leading space) and so never matched.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
# Schemes whose last path segment may carry ";params" (see urlparse()).
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
# Schemes for which urlsplit() separates a "?query" component.
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
# Schemes for which urlsplit() separates a "#fragment" component.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
59
# Once the cache holds this many entries, urlsplit() empties it before
# storing another result.
MAX_CACHE_SIZE = 20
# Results of earlier urlsplit() calls, keyed by their arguments.
_parse_cache = {}


def clear_cache():
    """Discard every cached urlsplit() result."""
    _parse_cache.clear()
66
67
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    The host class must expose a ``netloc`` string attribute; the
    properties below derive the user-info, host and port parts from it.
    """

    def _hostinfo(self):
        """Return ``(host, port_text)`` taken from ``netloc``.

        ``port_text`` is None when no ':' port separator is present and
        may be the empty string for a dangling ':' (as in 'host:').
        """
        hostinfo = self.netloc.rpartition("@")[2]
        if hostinfo.startswith("["):
            # Bracketed IPv6 literal: the colons inside the brackets belong
            # to the address, only a colon after ']' introduces a port.
            literal, closed, tail = hostinfo[1:].partition("]")
            if closed:
                _, sep, port = tail.partition(":")
                return literal, (port if sep else None)
        host, sep, port = hostinfo.partition(":")
        return host, (port if sep else None)

    @property
    def username(self):
        """User name from the user-info part, or None if there is no '@'."""
        userinfo, sep, _ = self.netloc.rpartition("@")
        if not sep:
            return None
        return userinfo.partition(":")[0]

    @property
    def password(self):
        """Password from the user-info part, or None if none was given."""
        userinfo, sep, _ = self.netloc.rpartition("@")
        if not sep:
            return None
        _, colon, password = userinfo.partition(":")
        return password if colon else None

    @property
    def hostname(self):
        """Lower-cased host name (without brackets or port), or None if empty."""
        return self._hostinfo()[0].lower() or None

    @property
    def port(self):
        """Port as an int, or None when absent or empty.

        Raises ValueError if the port text is not a base-10 integer.
        """
        port = self._hostinfo()[1]
        if not port:
            # Covers both "no colon" and a dangling colon such as 'host:'.
            return None
        return int(port, 10)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000108
109from collections import namedtuple
110
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """Five-field URL tuple that also offers the ResultMixin properties."""

    __slots__ = ()

    def geturl(self):
        """Recombine the five components into a URL via urlunsplit()."""
        return urlunsplit(self)
117
118
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """Six-field URL tuple that also offers the ResultMixin properties."""

    __slots__ = ()

    def geturl(self):
        """Recombine the six components into a URL via urlunparse()."""
        return urlunparse(self)
125
126
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    The components are not broken up any further (netloc stays a single
    string) and % escapes are left unexpanded."""
    scheme, netloc, url, query, fragment = urlsplit(url, scheme,
                                                    allow_fragments)
    params = ''
    # Parameters are only separated for schemes that are known to use them.
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    return ParseResult(scheme, netloc, url, params, query, fragment)
140
def _splitparams(url):
    """Split *url* at the first ';' of its last path segment.

    Returns (path, params); params is '' when the URL contains a '/'
    but its last segment has no ';'.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        # Only a ';' located in the final segment introduces parameters.
        semi = url.find(';', last_slash)
        if semi < 0:
            return url, ''
    else:
        semi = url.find(';')
    return url[:semi], url[semi + 1:]
149
def _splitnetloc(url, start=0):
    """Return (netloc, rest) where netloc is url[start:] up to the first
    of '/', '?' or '#', and rest is everything from that delimiter on."""
    hits = [url.find(mark, start) for mark in '/?#']
    # The domain part ends at the earliest delimiter found, or at the end
    # of the string when none of them occurs.
    end = min([len(url)] + [pos for pos in hits if pos >= 0])
    return url[start:end], url[end:]
157
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    *scheme* is the default used when the URL does not name one.  With
    *allow_fragments* false the '#' is left in the preceding component.
    Results are memoized in the module-level parse cache."""
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http':  # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        candidate = url[:i]
        if all(c in scheme_chars for c in candidate):
            # "name:1234" is a path followed by a port number, not a scheme.
            # Only an all-digit remainder counts as a port; inspecting just
            # the first character would drop the scheme of URLs such as
            # "mailto:1337@example.org".
            rest = url[i+1:]
            if not rest or any(c not in '0123456789' for c in rest):
                scheme, url = candidate.lower(), rest
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
201
def urlunparse(components):
    """Rebuild a URL from the 6 components produced by urlparse().

    The result may differ slightly from the URL originally parsed while
    remaining equivalent, for instance when the original carried a
    redundant delimiter such as a '?' followed by an empty query."""
    scheme, netloc, path, params, query, fragment = components
    if params:
        # Parameters are re-attached to the path before the 5-part join.
        path = "%s;%s" % (path, params)
    return urlunsplit((scheme, netloc, path, query, fragment))
211
def urlunsplit(components):
    """Join the five items of a urlsplit()-style tuple into one URL string.

    *components* may be any five-item iterable.  The result can differ
    slightly from the URL originally parsed while staying equivalent,
    e.g. when the original had an unnecessary delimiter such as a '?'
    with an empty query."""
    scheme, netloc, path, query, fragment = components
    if netloc or (scheme and scheme in uses_netloc and path[:2] != '//'):
        # An authority section needs the path to start with a slash.
        if path and path[:1] != '/':
            path = '/' + path
        path = '//' + (netloc or '') + path
    result = path
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return result
229
def urljoin(base, url, allow_fragments=True):
    """Resolve *url*, which may be relative, against *base* and return the
    resulting absolute URL."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    # A different scheme, or one without relative forms, wins outright.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty relative path: start from the base path.
        path = bpath
        if params:
            return urlunparse((scheme, netloc, path[:-1],
                               params, query, fragment))
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))

    # Merge: base path without its last segment, then the relative path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly drop the first interior "<name>/.." pair until none is
    # left; the scan restarts from the front after every removal.
    removed = True
    while removed:
        removed = False
        for i in range(1, len(segments) - 1):
            if segments[i] == '..' and segments[i-1] not in ('', '..'):
                del segments[i-1:i+1]
                removed = True
                break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
286
def urldefrag(url):
    """Strip the fragment, if any, from *url*.

    Returns the pair (url_without_fragment, fragment).  When *url* has
    no '#', it is returned unchanged together with an empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
300
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str (encoded as UTF-8 first) or a bytes-like object.
    A '%' that is not followed by exactly two hexadecimal digits is kept
    literally.  Always returns bytes.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if isinstance(string, str):
        string = string.encode('utf-8')
    hexdigits = b'0123456789ABCDEFabcdef'
    chunks = string.split(b'%')
    out = [chunks[0]]
    for item in chunks[1:]:
        code = item[:2]
        # Validate both digits explicitly: int() alone would also accept a
        # single digit, a sign or surrounding whitespace ('%a', '%+1', '%1 ').
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            out.append(bytes([int(code, 16)]))
            out.append(item[2:])
        else:
            out.append(b'%')
            out.append(item)
    return b''.join(out)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000316
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is left
    in the output unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    chunks = string.split('%')
    out = [chunks[0]]
    # Byte values of a contiguous run of escapes.  Decoding is delayed until
    # the run ends because one character may span several escaped bytes.
    pending = []
    for item in chunks[1:]:
        code = item[:2]
        # Check the digits explicitly: bytes.fromhex() skips whitespace and
        # would silently swallow input such as '%  '.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            pending.append(int(code, 16))
            rest = item[2:]
        else:
            rest = '%' + item
        if rest:
            # Literal text ends the current run of escapes: flush it.
            if pending:
                out.append(bytes(pending).decode(encoding, errors))
                pending = []
            out.append(rest)
    if pending:
        out.append(bytes(pending).decode(encoding, errors))
    return ''.join(out)
357
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dict mapping each name to a list of values.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: when true, blank values in the query are kept
        as empty strings; when false (the default) they are dropped
        as if they had not been included.

    strict_parsing: when true, parsing errors raise ValueError;
        when false (the default) they are silently ignored.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        # Repeated names accumulate their values in order of appearance.
        grouped.setdefault(name, []).append(value)
    return grouped
383
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: when true, blank values in the query are kept
        as empty strings; when false (the default) they are dropped
        as if they had not been included.

    strict_parsing: when true, parsing errors raise ValueError;
        when false (the default) they are silently ignored.

    Fields are separated by '&' or ';'.  Returns the pairs as a list, in
    the order they appear in the query.
    """
    fields = []
    for chunk in qs.split('&'):
        fields.extend(chunk.split(';'))
    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        name, equals, value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control name without '=' only survives as a blank value.
            if not keep_blank_values:
                continue
        if value or keep_blank_values:
            name = unquote(name.replace('+', ' '))
            value = unquote(value.replace('+', ' '))
            result.append((name, value))
    return result
423
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but first turn every plus sign into a space, as
    needed when unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
432
# Byte values that are never percent-encoded.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
# Quoter instances, keyed by their 'safe' bytes (see quote_from_bytes()).
_safe_quoters = {}


class Quoter(collections.defaultdict):
    """A mapping from byte values (ints in range(0, 256)) to strings.

    A byte maps to itself as a character when it is below 128 and in the
    safe set (the caller's set plus the always-safe defaults); any other
    byte maps to its '%XX' percent-encoded form.
    """
    # The defaultdict doubles as the cache: a key that has been seen is
    # served by the dict lookup itself and never reaches __missing__.

    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE | frozenset(c for c in safe if c < 128)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%%%02X' % b
        self[b] = quoted
        return quoted
460
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Every part of a URL (path, query, ...) has its own set of reserved
    characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of them is reserved in some component of a URL, but not
    necessarily in all of them.

    By default this function targets the path section of a URL, so '/'
    is left alone: it is reserved, but in a path the existing slashes
    are normally there as separators.

    string and safe may be either str or bytes objects. encoding must
    not be specified if string is a str.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Input that is not a str is quoted as-is, so text-encoding
        # options make no sense.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'strict'
    return quote_from_bytes(string.encode(encoding, errors), safe)
502
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but spaces become '+', as needed for quoting HTML
    form values.  Plus signs already in the string are escaped unless
    listed in safe.  Unlike quote(), safe does not default to '/'.
    """
    # Without any space in the input, plain quote() already gives the
    # right answer.  Input that is neither str nor bytes takes the
    # general path below.
    if isinstance(string, str):
        may_have_space = ' ' in string
    elif isinstance(string, bytes):
        may_have_space = b' ' in string
    else:
        may_have_space = True
    if not may_have_space:
        return quote(string, safe, encoding, errors)
    # Let spaces through quote() untouched, then turn them into '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000519
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but takes a bytes object instead of a str and does no
    string-to-bytes encoding.  The result is always an ASCII string.
    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
    """
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    cachekey = bytes(safe)  # In case it was a bytearray
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected a bytes")
    quoter = _safe_quoters.get(cachekey)
    if quoter is None:
        # First use of this safe set: build its Quoter and keep it.
        quoter = _safe_quoters[cachekey] = Quoter(safe)
    return ''.join(quoter[byte] for byte in bs)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000537
def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values may be str or bytes.  bytes are quoted with
    quote_plus(item, safe); anything else is converted with str() and
    quoted with quote_plus using safe, encoding and errors.
    """

    def _quote_item(item):
        # bytes bypass the text encoding options; the rest is stringified.
        if isinstance(item, bytes):
            return quote_plus(item, safe)
        return quote_plus(str(item), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    expand_sequences = bool(doseq)
    pairs = []
    for k, v in query:
        k = _quote_item(k)
        if not expand_sequences:
            values = [_quote_item(v)]
        elif isinstance(v, bytes):
            values = [quote_plus(v, safe)]
        elif isinstance(v, str):
            values = [quote_plus(v, safe, encoding, errors)]
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                values = [quote_plus(str(v), safe, encoding, errors)]
            else:
                # one parameter per element of the sequence
                values = [_quote_item(elt) for elt in v]
        pairs.extend(k + '=' + value for value in values)
    return '&'.join(pairs)
615
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') --> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') -->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') --> 'abc def'
# quote('abc def') --> 'abc%20def'
630
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str is checked to be pure ASCII and returned as a str; any other
    object is returned unchanged.  Raises UnicodeError for a str holding
    non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
643
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    text = str(url).strip()
    # Drop one pair of enclosing angle brackets, then a leading 'URL:' tag.
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
651
# Compiled lazily on the first call to splittype().
_typeprog = None


def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned lower-cased; it is None when the URL has no
    leading 'type:' part."""
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # found.end() is the index just past the ':' that closes the type.
    return found.group(1).lower(), url[found.end():]
665
# Compiled lazily on the first call to splithost().
_hostprog = None


def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when the URL does not start with '//'."""
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1, 2)
677
# Compiled lazily on the first call to splituser().
_userprog = None


def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'; both parts are passed through
    unquote().  Returns (None, host) when there is no '@'.  The result is
    always a tuple.
    """
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match:
        # Materialize the pair: a bare map() object is a one-shot iterator
        # that cannot be indexed or compared, unlike the no-match tuple.
        return tuple(unquote(part) for part in match.group(1, 2))
    return None, host
689
# Compiled lazily on the first call to splitpasswd().
_passwdprog = None


def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; returns (user, None) when there
    is none."""
    global _passwdprog
    if _passwdprog is None:
        import re
        # re.S lets the password part contain newlines.
        _passwdprog = re.compile('^([^:]*):(.*)$', re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
701
# splitport('host:port') --> 'host', 'port'
# Compiled lazily on the first call to splitport().
_portprog = None


def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  Returns (host, None)
    when the text does not end in ':' followed by at least one digit."""
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1, 2)
714
# Compiled lazily on the first call to splitnport().
_nportprog = None


def splitnport(host, defport=-1):
    """Split host and port, returning the port as a number.

    Without any ':' the result is (host, defport); defport defaults to -1.
    With a ':' the text after the last one is converted with int(); the
    port is None when that text is empty or not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port = found.group(1, 2)
    nport = None
    if port:
        try:
            nport = int(port)
        except ValueError:
            # Not a number: report the port as None.
            pass
    return host, nport
736
# Compiled lazily on the first call to splitquery().
_queryprog = None


def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Returns (url, None) when the URL
    contains no '?'.
    """
    global _queryprog
    if _queryprog is None:
        import re
        # Raw string: '\?' in a normal literal is an invalid escape sequence.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
748
# Compiled lazily on the first call to splittag().
_tagprog = None


def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'; returns (url, None) when there
    is none."""
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
760
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    # Everything before the first ';' is the path, the rest are attributes.
    path, *attrs = url.split(';')
    return path, attrs
766
# Compiled lazily on the first call to splitvalue().
_valueprog = None


def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='; returns (attr, None) when there
    is none."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)