blob: 3d541a72063406c6b1a46e25a840e030dbbc1e83 [file] [log] [blame]
"""Parse (absolute and relative) URLs.

The urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any changes to the urlparse
module should conform to it.  The urlparse module is not entirely compliant
with it: de facto parsing scenarios are sometimes taken into account and, for
backward compatibility purposes, some parsing behavior from the older RFCs is
retained.  The test cases in test_urlparse.py provide a good indicator of the
parsing behavior.
"""
28
Facundo Batista2ac5de22008-07-07 18:24:11 +000029import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000030import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000031
# Names exported by ``from <module> import *``.  ``urlencode`` is a public
# helper defined in this module and therefore belongs in the export list
# alongside the quoting functions.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs", "parse_qsl",
           "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000036
# Scheme classification tables.  The empty string '' stands for "no scheme
# given" and makes the corresponding behavior apply by default.  These are
# plain lists (order preserved) so that callers can extend them at runtime.

# Schemes whose URLs may be given relative to a base URL.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', '', 'sftp',
]

# Schemes whose URLs carry a network location ("//netloc") part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
    'svn', 'svn+ssh', 'sftp', 'nfs',
]

# Schemes without a hierarchical path.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips',
]

# Schemes whose last path segment may carry ";params".
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    'mms', '', 'sftp',
]

# Schemes for which "?query" is split off the path.
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '',
]

# Schemes for which "#fragment" is split off the path.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero', '',
]

# Characters valid in scheme names
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789'
    '+-.'
)

# urlsplit() memoizes its results in _parse_cache; once the cache holds
# MAX_CACHE_SIZE entries it is emptied before the next result is stored.
MAX_CACHE_SIZE = 20
_parse_cache = {}
64
def clear_cache():
    """Empty both module-level caches: parsed URLs and safe-set quoters."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000069
70
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Every property is derived from ``self.netloc``, which has the general
    form ``[user[:password]@]host[:port]``; the host may be a bracketed
    IPv6 literal such as ``[::1]``.
    """

    @property
    def username(self):
        """User name from the netloc, or None when there is no '@'."""
        userinfo, at_sign, _ = self.netloc.rpartition("@")
        if not at_sign:
            return None
        # Anything after the first ':' in the userinfo is the password.
        return userinfo.split(":", 1)[0]

    @property
    def password(self):
        """Password from the netloc, or None when none is present."""
        userinfo, at_sign, _ = self.netloc.rpartition("@")
        if at_sign and ":" in userinfo:
            return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host name, or None when the netloc has no host part.

        For a bracketed IPv6 literal the text between '[' and ']' is
        returned without the brackets.
        """
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Port number as an int, or None when no port is given.

        An empty port (a trailing ':' as in ``host:``) is treated the same
        as a missing one instead of failing inside int().

        Raises:
            ValueError: if the port text is present but not a decimal number.
        """
        # Drop the userinfo and any bracketed IPv6 literal so that only the
        # optional ":port" suffix is left.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' not in netloc:
            return None
        port = netloc.split(':')[1]
        if not port:
            return None
        return int(port, 10)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000113
114from collections import namedtuple
115
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """5-tuple result of urlsplit(), with the ResultMixin accessors."""

    # No per-instance __dict__: instances stay as light as plain tuples.
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunsplit()."""
        return urlunsplit(self)
122
123
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """6-tuple result of urlparse(), with the ResultMixin accessors."""

    # No per-instance __dict__: instances stay as light as plain tuples.
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunparse()."""
        return urlunparse(self)
130
131
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a ParseResult 6-tuple:
    (scheme, netloc, path, params, query, fragment).
    The components are not broken up into smaller bits (e.g. netloc is a
    single string) and % escapes are not expanded.
    """
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    # Only schemes listed in uses_params get ";params" split off the path.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
145
def _splitparams(url):
    """Split ``path;params`` into ``(path, params)``.

    Only a ';' in the last path segment (after the final '/') starts the
    parameters.  When there is no such ';' the path is returned unchanged
    together with an empty params string.
    """
    if '/' in url:
        i = url.find(';', url.rfind('/'))
    else:
        i = url.find(';')
    # Guard both branches: slicing with i == -1 would chop the last
    # character off the path and return the whole input as params.
    if i < 0:
        return url, ''
    return url[:i], url[i+1:]
154
def _splitnetloc(url, start=0):
    """Return ``(netloc, rest)`` for *url*, scanning from index *start*.

    The network location ends at the first '/', '?' or '#' found at or
    after *start*, or at the end of the string if none of them occurs.
    """
    hits = [pos for pos in (url.find(ch, start) for ch in '/?#') if pos >= 0]
    end = min(hits) if hits else len(url)
    return url[start:end], url[end:]
162
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Return a SplitResult 5-tuple: (scheme, netloc, path, query, fragment).
    The components are not broken up into smaller bits (e.g. netloc is a
    single string) and % escapes are not expanded.

    Results are memoized in the module-level _parse_cache.

    Raises:
        ValueError: if the netloc contains only one of '[' and ']'.
    """
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key so that equal-comparing values
    # of different types do not share a cache entry.
    cache_key = (url, scheme, allow_fragments, type(url), type(scheme))
    hit = _parse_cache.get(cache_key)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    # A literal lower-case "http" prefix is the common case: it needs no
    # per-character validation and always has query/fragment split off.
    is_http = False
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        is_http = prefix == 'http'
        if is_http or all(ch in scheme_chars for ch in prefix):
            scheme = prefix.lower()
            url = url[colon + 1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Brackets are only valid as a matched pair around an IPv6 literal.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if (allow_fragments and (is_http or scheme in uses_fragment)
            and '#' in url):
        url, fragment = url.split('#', 1)
    if (is_http or scheme in uses_query) and '?' in url:
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
211
def urlunparse(components):
    """Combine a 6-tuple as returned by urlparse() back into a URL string.

    The result may differ slightly from the URL that was parsed while still
    being equivalent, for example when the original contained redundant
    delimiters such as a '?' with an empty query.
    """
    scheme, netloc, path, params, query, fragment = components
    if params:
        path = "%s;%s" % (path, params)
    return urlunsplit((scheme, netloc, path, query, fragment))
221
def urlunsplit(components):
    """Combine a 5-tuple as returned by urlsplit() back into a URL string.

    Empty components are omitted together with their delimiters.
    """
    scheme, netloc, url, query, fragment = components
    # Emit "//netloc" when there is a netloc, or when the scheme is one that
    # takes a netloc and the path does not already start with "//".
    needs_netloc = netloc or (scheme and scheme in uses_netloc
                              and url[:2] != '//')
    if needs_netloc:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
234
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.  *url*
    is also returned unchanged when its scheme differs from the base scheme
    or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default for the reference, so a reference
    # without a scheme compares equal to the base below.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        # A reference with its own netloc replaces everything after the
        # scheme; otherwise the base netloc is inherited.
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: nothing of the base path is used.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: start from the base path.
        path = bpath
        if not params:
            params = bparams
        else:
            # The reference has its own params: drop the last character of
            # the (base) path and return at once, keeping the reference's
            # params, query and fragment.
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                                params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: replace the last segment of the base path with the
    # segments of the reference.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' becomes an empty segment, i.e. a trailing slash.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly delete the first "<name>/.." pair in which <name> is neither
    # empty nor '..'.  The scan never examines the final segment and restarts
    # after every deletion.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # Handle a '..' left in the final position.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
291
def urldefrag(url):
    """Remove any existing fragment from *url*.

    Return a tuple ``(defragmented_url, fragment)``.  When *url* contains no
    '#', it is returned unchanged and the fragment is the empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
305
# Byte values of the ASCII hexadecimal digits, for validating %xx escapes.
_HEX_BYTES = frozenset(b'0123456789ABCDEFabcdef')

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes-like object.  Every ``%xx`` escape made
    of exactly two hexadecimal digits is replaced by the byte it denotes;
    any other '%' is left in the output untouched.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if isinstance(string, str):
        string = string.encode('utf-8')
    pieces = string.split(b'%')
    decoded = [pieces[0]]
    for item in pieces[1:]:
        code = item[:2]
        # Check the digits explicitly: int() alone would also accept a
        # single digit, a sign or surrounding whitespace ("%a", "%+1").
        if (len(code) == 2 and code[0] in _HEX_BYTES
                and code[1] in _HEX_BYTES):
            decoded.append(bytes([int(code, 16)]) + item[2:])
        else:
            decoded.append(b'%' + item)
    return b''.join(decoded)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000321
# ASCII hexadecimal digits, for validating %xx escapes.
_HEX_CHARS = frozenset('0123456789ABCDEFabcdef')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    The optional encoding and errors parameters specify how to decode
    percent-encoded sequences into Unicode characters, as accepted by the
    bytes.decode() method.  Passing None for either selects its default.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is copied to
    the output unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    pieces = string.split('%')
    out = [pieces[0]]
    # Bytes of the current run of adjacent %xx escapes.  They are decoded
    # together because one character may span several escaped bytes.
    pending = bytearray()
    for item in pieces[1:]:
        code = item[:2]
        # Check the digits explicitly: bytes.fromhex() skips whitespace and
        # would therefore swallow a sequence such as "%  ".
        if len(code) == 2 and code[0] in _HEX_CHARS and code[1] in _HEX_CHARS:
            pending.append(int(code, 16))
            rest = item[2:]
        else:
            rest = '%' + item
        if rest:
            # Literal text ends the current run of escapes: flush it first.
            if pending:
                out.append(pending.decode(encoding, errors))
                pending = bytearray()
            out.append(rest)
    if pending:
        out.append(pending.decode(encoding, errors))
    return ''.join(out)
362
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dict mapping names to lists of values.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Values for a repeated name are collected in order of appearance.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
388
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Fields are separated by '&' or ';'.  In both names and values '+' is
    turned into a space before %xx escapes are decoded.

    Returns a list of (name, value) tuples in order of appearance.
    """
    fields = []
    for chunk in qs.split('&'):
        fields.extend(chunk.split(';'))

    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        name, equals, value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control name without an equal sign counts as a blank value.
            if not keep_blank_values:
                continue
        if value or keep_blank_values:
            result.append((unquote(name.replace('+', ' ')),
                           unquote(value.replace('+', ' '))))
    return result
428
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but first turn every plus sign into a space, as
    required for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
437
# Byte values that are never percent-encoded: ASCII letters, digits and "_.-".
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
# Cache of Quoter instances, keyed by the bytes of their extra "safe" set.
_safe_quoters = {}

class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # The defaultdict base doubles as the cache: only the first lookup of a
    # byte value runs Python code (__missing__); later lookups are plain
    # dictionary hits.
    def __init__(self, safe):
        """safe: bytes object."""
        # Non-ASCII bytes can never be safe, so they are filtered out here.
        self.safe = _ALWAYS_SAFE | {c for c in safe if c < 128}

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, and return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%%%02X' % b
        self[b] = quoted
        return quoted
465
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects.  encoding and
    errors must not be specified if string is not a str; doing so raises
    TypeError.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Already bytes-like: there is nothing to encode, so the text
        # encoding arguments are rejected rather than silently ignored.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'strict'
    return quote_from_bytes(string.encode(encoding, errors), safe)
507
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # string may be a str or a bytes object.  When it is known to contain no
    # space, plain quote() already produces the right answer.
    if isinstance(string, str):
        may_have_space = ' ' in string
    elif isinstance(string, bytes):
        may_have_space = b' ' in string
    else:
        may_have_space = True
    if not may_have_space:
        return quote(string, safe, encoding, errors)

    # Otherwise keep spaces unescaped by declaring them safe, then turn them
    # into '+'.  The space literal must match the type of *safe*.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000524
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'

    Raises TypeError if *bs* is neither bytes nor bytearray.
    """
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    cachekey = bytes(safe)  # In case it was a bytearray
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected a bytes")
    # One Quoter is kept per distinct safe set.  Compare against None: a
    # Quoter that has not served a lookup yet is an empty (falsy) mapping.
    quoter = _safe_quoters.get(cachekey)
    if quoter is None:
        quoter = Quoter(safe)
        _safe_quoters[cachekey] = quoter
    return ''.join(map(quoter.__getitem__, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000542
def urlencode(query, doseq=False):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Raises TypeError if query is neither a mapping nor a sequence of tuples
    (for example, a non-empty string).
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # Strings and string-like objects are sequences too, so they have to
        # be rejected explicitly.
        try:
            # len() fails for non-sequences; a non-empty string fails the
            # "first element is a tuple" check.  Zero-length sequences of any
            # type are accepted, consistent with empty dicts being allowed.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    pairs = []
    for key, value in query:
        key = quote_plus(str(key))
        if not doseq:
            pairs.append(key + '=' + quote_plus(str(value)))
        elif isinstance(value, str):
            pairs.append(key + '=' + quote_plus(value))
        else:
            try:
                # Having a length is what counts as "being a sequence" here.
                len(value)
            except TypeError:
                # not a sequence
                pairs.append(key + '=' + quote_plus(str(value)))
            else:
                # one "key=element" parameter per element of the sequence
                pairs.extend(key + '=' + quote_plus(str(item))
                             for item in value)
    return '&'.join(pairs)
598
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') --> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') -->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# unquote('abc%20def') --> 'abc def'
# quote('abc def') --> 'abc%20def'
613
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Check that a str URL contains only ASCII characters and return it (still
    as a str); UnicodeError is raised otherwise.  Arguments that are not str
    are returned unchanged.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
626
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str().  Surrounding whitespace, one pair
    of enclosing angle brackets and a leading 'URL:' marker are removed, in
    that order.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
634
# Compiled on first use by splittype().
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  Returns (None, url) when url does
    not start with a "type:" prefix free of '/' characters.
    """
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    scheme = found.group(1)
    # Skip the scheme and the ':' that follows it.
    return scheme.lower(), url[len(scheme) + 1:]
648
# Compiled on first use by splithost().
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.groups()
660
# Compiled on first use by splituser().
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Both parts are passed through
    unquote().  Returns (None, host) when host contains no '@'.

    A tuple is returned in both cases, so the result can be indexed and
    measured as well as unpacked.
    """
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match is None:
        return None, host
    # map() alone would hand back a one-shot iterator, not a pair.
    return tuple(map(unquote, match.group(1, 2)))
672
# Compiled on first use by splitpasswd().
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Returns (user, None) when there is
    no ':'.
    """
    global _passwdprog
    if _passwdprog is None:
        import re
        # re.S lets '.' match newlines as well.
        _passwdprog = re.compile('^([^:]*):(.*)$', re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.groups()
684
# splitport('host:port') --> 'host', 'port'
# Compiled on first use by splitport().
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  Returns (host, None) when
    host does not end in ':' followed by at least one digit.
    """
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.groups()
697
# Compiled on first use by splitnport().
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port = found.groups()
    try:
        # int('') raises ValueError as well, so an empty port yields None.
        nport = int(port)
    except ValueError:
        nport = None
    return host, nport
719
# Compiled on first use by splitquery().
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Returns (url, None) when url
    contains no '?'.
    """
    global _queryprog
    if _queryprog is None:
        import re
        # Raw string: in a normal literal '\?' is an invalid escape
        # sequence, which newer Python versions warn about.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None
731
# Compiled on first use by splittag().
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Returns (url, None) when url
    contains no '#'.
    """
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.groups()
743
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
749
# Compiled on first use by splitvalue().
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Returns (attr, None) when attr
    contains no '='.
    """
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.groups()
761
# Built-in input for test().  The first non-blank line is a bare URL, which
# test() adopts as the base.  Every following line has the form
# "<reference> = <URL:expected>": the reference is joined to the base and the
# wrapped result is compared with the expected text.  test() tokenizes each
# line with str.split(), so the amount of whitespace between columns does not
# matter.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y    = <URL:http://a/b/c/d?y>
      http:g?y   = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
795
def test():
    """Exercise urlparse() and urljoin() on a table of URLs and print results.

    Input is read from the file named by the first command-line argument
    ('-' means standard input), or from the built-in test_input table when
    no argument is given.  The first URL read becomes the base; every later
    URL is joined to it.  For a line of the form "url = expected", a
    mismatch between the join result and the expected text is reported with
    an 'EXPECTED' line.
    """
    base = ''
    opened = None  # the file this function opened itself, if any
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        from io import StringIO
        fp = StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED', words[2], '!!!!!!!!!!')
    finally:
        # Close only what was opened here; sys.stdin is left alone.
        if opened is not None:
            opened.close()
822
# Run the self-test when this file is executed as a script.
if __name__ == '__main__':
    test()