blob: b39fc25eb845c830573d6616c3a3d7782806e25d [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaranfd41e082010-04-17 14:44:14 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
11RFC2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
14RFC2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
15
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
19RFC1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20McCahill, December 1994
21
RFC 3986 is considered the current standard and any changes to the urlparse
module should conform to it.  The urlparse module is not entirely compliant
with it: de facto parsing scenarios are sometimes taken into account and, for
backward compatibility, parsing behavior from older RFCs is retained.  The test
cases in test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000027"""
28
Facundo Batista2ac5de22008-07-07 18:24:11 +000029import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000030import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000031
# Names exported by ``from <this module> import *``.  urlencode() is a public
# function of this module and is therefore listed alongside the other
# quoting/parsing entry points.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000036
# Scheme classification tables.  The empty string '' is the entry for "no
# scheme", i.e. it makes the rule apply by default.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', '', 'sftp',
]
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
    'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
]
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips',
]
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    'mms', '', 'sftp',
]
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '',
]
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero', '',
]

# Every character that may appear in a scheme name.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() results are memoised in _parse_cache; the cache is emptied once
# it holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}
64
def clear_cache():
    """Empty the URL parse cache and the cache of quoter callables."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000069
70
class ResultMixin(object):
    """Shared accessors for the parsed result objects.

    Every property is derived from the ``netloc`` attribute of the object
    the mixin is combined with; nothing is cached.
    """

    @property
    def username(self):
        """User name from ``user[:password]@host``, or None without '@'."""
        # Split on the LAST '@' so that an '@' inside the password does not
        # truncate the user-info part.
        userinfo, have_info, _ = self.netloc.rpartition("@")
        if not have_info:
            return None
        return userinfo.partition(":")[0]

    @property
    def password(self):
        """Password from ``user:password@host``, or None if not present."""
        userinfo, have_info, _ = self.netloc.rpartition("@")
        _, have_password, password = userinfo.partition(":")
        if have_info and have_password:
            return password
        return None

    @property
    def hostname(self):
        """Lower-cased host name, or None when the netloc is empty.

        For a bracketed (IPv6 literal) host the text between '[' and ']'
        is returned without the brackets.
        """
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Port number as an int, or None when no port is given.

        An empty port (``host:``) is treated like a missing port and yields
        None instead of raising.  A non-numeric port still raises
        ValueError from int().
        """
        # Drop the user-info part and any bracketed IPv6 literal so that the
        # only ':' left is the one that introduces the port.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' not in netloc:
            return None
        port = netloc.split(':')[1]
        if not port:
            return None
        return int(port, 10)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000113
114from collections import namedtuple
115
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """Named 5-tuple (scheme, netloc, path, query, fragment)."""

    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL string via urlunsplit()."""
        return urlunsplit(self)
122
123
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """Named 6-tuple (scheme, netloc, path, params, query, fragment)."""

    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL string via urlunparse()."""
        return urlunparse(self)
130
131
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a ParseResult: (scheme, netloc, path, params, query, fragment).
    The components are not broken up further (netloc stays a single
    string) and % escapes are not expanded.
    """
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Parameters are only split off for schemes that are known to use them.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
145
def _splitparams(url):
    """Split ';params' off the last path segment of *url*.

    Returns (path, params).  When the path contains a '/', only a ';' at
    or after the last '/' counts; if there is none, (url, '') is returned.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semi = url.find(';', last_slash)
        if semi < 0:
            return url, ''
    else:
        semi = url.find(';')
    return url[:semi], url[semi + 1:]
154
def _splitnetloc(url, start=0):
    """Return (netloc, rest) for *url*, with the netloc starting at *start*.

    The netloc ends at the first '/', '?' or '#' found at or after *start*,
    or at the end of the string when none of them occurs.
    """
    hits = [pos for pos in (url.find(c, start) for c in '/?#') if pos >= 0]
    if hits:
        end = min(hits)      # earliest delimiter wins
    else:
        end = len(url)       # no delimiter: everything is netloc
    return url[start:end], url[end:]
162
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Return a SplitResult: (scheme, netloc, path, query, fragment).
    The components are not broken up further (netloc stays a single
    string) and % escapes are not expanded.

    Results are memoised in the module-level parse cache.  Raises
    ValueError("Invalid IPv6 URL") when the netloc contains only one of
    '[' and ']'.
    """
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(key)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    # A literal lower-case 'http' prefix is the common case: it needs no
    # scheme-character validation and always splits fragment and query,
    # regardless of the classification tables.
    is_http = False
    if colon > 0:
        prefix = url[:colon]
        is_http = prefix == 'http'
        if is_http or all(ch in scheme_chars for ch in prefix):
            scheme, url = prefix.lower(), url[colon + 1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Exactly one of '[' / ']' present means unbalanced brackets.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if (allow_fragments and (is_http or scheme in uses_fragment)
            and '#' in url):
        url, fragment = url.split('#', 1)
    if (is_http or scheme in uses_query) and '?' in url:
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return result
211
def urlunparse(components):
    """Put a parsed URL back together again.

    *components* is a 6-item iterable (scheme, netloc, path, params, query,
    fragment).  The result may be a slightly different, but equivalent URL
    if the URL that was parsed originally had redundant delimiters, e.g. a
    '?' with an empty query.
    """
    scheme, netloc, path, params, query, fragment = components
    full_path = "%s;%s" % (path, params) if params else path
    return urlunsplit((scheme, netloc, full_path, query, fragment))
221
def urlunsplit(components):
    """Combine (scheme, netloc, path, query, fragment) into a URL string.

    A '//' authority part is emitted when a netloc is present, or when the
    scheme is one that uses a netloc and the path does not already begin
    with '//'.  Empty scheme, query and fragment are omitted together with
    their delimiters.
    """
    scheme, netloc, url, query, fragment = components
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        # A non-empty path must be separated from the authority by '/'.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url += '?' + query
    if fragment:
        url += '#' + fragment
    return url
234
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.  *url*
    is also returned unchanged when its scheme differs from the base scheme
    or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # url carries its own authority: nothing is taken from base.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        if params:
            # Own params: base path minus its last character is used.
            return urlunparse((scheme, netloc, bpath[:-1],
                               params, query, fragment))
        # No path and no params: inherit both, and the query if missing.
        return urlunparse((scheme, netloc, bpath,
                           bparams, query or bquery, fragment))

    # Relative path: replace the last segment of the base path, then
    # resolve '.' and '..' segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    while True:
        # Remove the first "<name>/.." pair found before the last segment;
        # rescan from the start until no such pair is left.
        for i in range(1, len(segments) - 1):
            if segments[i] == '..' and segments[i - 1] not in ('', '..'):
                del segments[i - 1:i + 1]
                break
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
291
def urldefrag(url):
    """Remove any existing fragment from *url*.

    Returns a tuple (defragmented URL, fragment).  If *url* contains no
    '#', it is returned unchanged together with an empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
305
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes object; a str is encoded as UTF-8
    first (only relevant if it contains unescaped non-ASCII characters,
    which URIs should not).

    A '%' is treated as an escape only when it is followed by exactly two
    hexadecimal digits; anything else (e.g. '%1', '%+1', '% 1', a trailing
    '%') is copied to the output unchanged.
    """
    if not string:
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    pieces = string.split(b'%')
    if len(pieces) == 1:
        return string
    hexdigits = b'0123456789ABCDEFabcdef'
    out = [pieces[0]]
    for item in pieces[1:]:
        code = item[:2]
        # int(code, 16) alone is too lenient: it also accepts a single
        # digit, a sign and surrounding whitespace, so check the digits
        # explicitly.
        if (len(code) == 2 and code[0] in hexdigits
                and code[1] in hexdigits):
            out.append(bytes([int(code, 16)]))
            out.append(item[2:])
        else:
            out.append(b'%')
            out.append(item)
    # Collect the parts and join once instead of repeated concatenation.
    return b''.join(out)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000324
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    The optional encoding and errors parameters specify how to decode
    percent-encoded sequences into Unicode characters, as accepted by the
    bytes.decode() method.  By default, percent-encoded sequences are
    decoded with UTF-8, and invalid sequences are replaced by a placeholder
    character.  Passing None for either parameter selects its default.

    A '%' is treated as an escape only when it is followed by exactly two
    hexadecimal digits; any other '%' is left in the result unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if not string:
        return string
    pieces = string.split('%')
    if len(pieces) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    # pct_sequence: contiguous run of percent-encoded bytes, not yet decoded
    pct_sequence = bytearray()
    out = [pieces[0]]
    for item in pieces[1:]:
        code = item[:2]
        # Validate the two digits explicitly: bytes.fromhex() skips
        # whitespace, so '%  ' would otherwise be swallowed silently.
        if (len(code) == 2 and code[0] in hexdigits
                and code[1] in hexdigits):
            pct_sequence.append(int(code, 16))
            rest = item[2:]
            if not rest:
                # This segment was just a single percent-encoded byte.
                # It may be part of a multi-byte sequence, so delay
                # decoding (the byte stays in pct_sequence).
                continue
        else:
            rest = '%' + item
        # Encountered non-percent-encoded characters: flush the current
        # pct_sequence before them.
        out.append(pct_sequence.decode(encoding, errors))
        out.append(rest)
        del pct_sequence[:]
    if pct_sequence:
        # Flush the final pct_sequence
        out.append(pct_sequence.decode(encoding, errors))
    return ''.join(out)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000368
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dict mapping each name to the list of its values, in the
    order in which they occur in the query string.
    """
    parsed = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        parsed.setdefault(name, []).append(value)
    return parsed
394
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples.
    """
    # Fields are separated by '&' or ';'.
    fields = qs.replace(';', '&').split('&')
    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        name, equals, value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control name without '=' only survives as a blank value.
            if not keep_blank_values:
                continue
        if value or keep_blank_values:
            name = unquote(name.replace('+', ' '))
            value = unquote(value.replace('+', ' '))
            result.append((name, value))

    return result
434
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required
    for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
443
# Byte values that are never percent-encoded: ASCII letters, digits and
# the characters '_', '.' and '-'.
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    b'abcdefghijklmnopqrstuvwxyz'
    b'0123456789'
    b'_.-'
)
# The same byte values as a bytes object.
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache of quoter callables, keyed by the normalised 'safe' bytes.
_safe_quoters = {}
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000450
class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # The defaultdict base doubles as the cache: only the first lookup of a
    # key reaches __missing__, later lookups are plain dict hits.
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it and return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
472
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects.  encoding and
    errors may only be given when string is a str; passing either one
    together with a non-str string raises TypeError.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if not string:
        return string
    codec = 'utf-8' if encoding is None else encoding
    handler = 'strict' if errors is None else errors
    return quote_from_bytes(string.encode(codec, handler), safe)
516
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values.  Plus signs in the original string are escaped unless
    they are included in safe.  It also does not have safe default to '/'.
    """
    # string may be a str or a bytes object.  Without any space in it, the
    # regular quote() already produces the right answer.
    if isinstance(string, str):
        has_space = ' ' in string
    elif isinstance(string, bytes):
        has_space = b' ' in string
    else:
        has_space = True    # any other type takes the general path below
    if not has_space:
        return quote(string, safe, encoding, errors)

    # Keep spaces unquoted for now (the extra safe character must match
    # the type of safe), then turn them into '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000533
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\\xab') -> 'abc%20def%AB'

    Raises TypeError when bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    # Normalize 'safe' to a bytes object without non-ASCII values.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)
    # Shortcut: nothing to quote if stripping safe bytes leaves nothing.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000555
def urlencode(query, doseq=False):
    """Encode a sequence of two-element tuples or dictionary into a URL
    query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values that are bytes objects are quoted as they are; they are
    neither passed through str() nor, with doseq, treated as a sequence of
    integers.  Everything else that is not a str is converted with str()
    before quoting.

    Raises TypeError when query is neither a mapping nor a sequence of
    tuples (in particular when it is a non-empty string).
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            ty, va, tb = sys.exc_info()
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    def quote_part(value):
        # bytes are quoted directly; str(b'x') would yield "b'x'".
        if isinstance(value, bytes):
            return quote_plus(value)
        return quote_plus(str(value))

    pairs = []
    if not doseq:
        for k, v in query:
            pairs.append(quote_part(k) + '=' + quote_part(v))
    else:
        for k, v in query:
            k = quote_part(k)
            if isinstance(v, bytes):
                # bytes has a length, but must not be expanded per byte
                pairs.append(k + '=' + quote_plus(v))
            elif isinstance(v, str):
                pairs.append(k + '=' + quote_plus(v))
            else:
                try:
                    # Is this a sufficient test for sequence-ness?
                    len(v)
                except TypeError:
                    # not a sequence
                    pairs.append(k + '=' + quote_part(v))
                else:
                    # loop over the sequence
                    for elt in v:
                        pairs.append(k + '=' + quote_part(elt))
    return '&'.join(pairs)
611
612# Utilities to parse URLs (most of these return None for missing parts):
613# unwrap('<URL:type://host/path>') --> 'type://host/path'
614# splittype('type:opaquestring') --> 'type', 'opaquestring'
615# splithost('//host[:port]/path') --> 'host[:port]', '/path'
616# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
617# splitpasswd('user:passwd') -> 'user', 'passwd'
618# splitport('host:port') --> 'host', 'port'
619# splitquery('/path?query') --> '/path', 'query'
620# splittag('/path#tag') --> '/path', 'tag'
621# splitattr('/path;attr1=value1;attr2=value2;...') ->
622# '/path', ['attr1=value1', 'attr2=value2', ...]
623# splitvalue('attr=value') --> 'attr', 'value'
624# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
626
def to_bytes(url):
    """Check that a str URL is pure ASCII and return it.

    A str argument is round-tripped through ASCII encoding; if that fails,
    UnicodeError is raised.  Any other argument is returned unchanged.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
639
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; then an enclosing '<...>' pair and a leading 'URL:' prefix
    are removed if present.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
647
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The scheme is returned in lower case.  If url does not start with a
    scheme followed by ':', (None, url) is returned.
    """
    global _typeprog
    if _typeprog is None:
        # The pattern is compiled on first use and kept in the global.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    scheme = found.group(1)
    return scheme.lower(), url[len(scheme) + 1:]
661
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/' or '?'.  If url does not start
    with '//', (None, url) is returned.
    """
    global _hostprog
    if _hostprog is None:
        # The pattern is compiled on first use and kept in the global.
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1, 2)
673
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split is made at the last '@'; both parts are passed through
    unquote().  If host contains no '@', (None, host) is returned.
    """
    global _userprog
    if _userprog is None:
        # The pattern is compiled on first use and kept in the global.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match:
        # Build a real tuple: a bare map() object is a one-shot iterator
        # and would not match the tuple returned by the other branch.
        return tuple(map(unquote, match.group(1, 2)))
    return None, host
685
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split is made at the first ':'; the password may contain any
    character, including newlines.  Without a ':', (user, None) is
    returned.
    """
    global _passwdprog
    if _passwdprog is None:
        # The pattern is compiled on first use and kept in the global.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$',re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
697
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  If host does not end in
    ':' followed by at least one digit, (host, None) is returned.
    """
    global _portprog
    if _portprog is None:
        # The pattern is compiled on first use and kept in the global.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1, 2)
710
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Return (host, defport) if no ':' is found; defport defaults to -1.
    Return the port as an int if the text after the last ':' is accepted
    by int().
    Return None as port if there is a ':' but no valid number after it.
    """
    global _nportprog
    if _nportprog is None:
        # The pattern is compiled on first use and kept in the global.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port = found.group(1, 2)
    nport = None
    if port:
        try:
            nport = int(port)
        except ValueError:
            nport = None
    return host, nport
732
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split is made at the last '?'.  If url contains no '?',
    (url, None) is returned.
    """
    global _queryprog
    if _queryprog is None:
        # The pattern is compiled on first use and kept in the global.
        import re
        # Raw string: '\?' is not a valid escape in a normal str literal.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
744
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split is made at the last '#'.  If url contains no '#',
    (url, None) is returned.
    """
    global _tagprog
    if _tagprog is None:
        # The pattern is compiled on first use and kept in the global.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
756
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
762
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split is made at the first '='.  If attr contains no '=',
    (attr, None) is returned.
    """
    global _valueprog
    if _valueprog is None:
        # The pattern is compiled on first use and kept in the global.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)
774
# Self-test table read by test(): the first non-blank line is the base URL,
# every following line is "<relative url> = <expected wrapped result>".
# Only whitespace-separated words matter to test().
test_input = """
    http://a/b/c/d

    g:h = <URL:g:h>
    http:g = <URL:http://a/b/c/g>
    http: = <URL:http://a/b/c/d>
    g = <URL:http://a/b/c/g>
    ./g = <URL:http://a/b/c/g>
    g/ = <URL:http://a/b/c/g/>
    /g = <URL:http://a/g>
    //g = <URL:http://g>
    ?y = <URL:http://a/b/c/d?y>
    g?y = <URL:http://a/b/c/g?y>
    g?y/./x = <URL:http://a/b/c/g?y/./x>
    . = <URL:http://a/b/c/>
    ./ = <URL:http://a/b/c/>
    .. = <URL:http://a/b/>
    ../ = <URL:http://a/b/>
    ../g = <URL:http://a/b/g>
    ../.. = <URL:http://a/>
    ../../g = <URL:http://a/g>
    ../../../g = <URL:http://a/../g>
    ./../g = <URL:http://a/b/g>
    ./g/. = <URL:http://a/b/c/g/>
    /./g = <URL:http://a/./g>
    g/./h = <URL:http://a/b/c/g/h>
    g/../h = <URL:http://a/b/c/h>
    http:g = <URL:http://a/b/c/g>
    http: = <URL:http://a/b/c/d>
    http:?y = <URL:http://a/b/c/d?y>
    http:g?y = <URL:http://a/b/c/g?y>
    http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
808
def test():
    """Run the urlparse()/urljoin() self-test and print the results.

    Input lines come from the file named by the first command-line
    argument ('-' means standard input) or, without arguments, from
    test_input.  The first URL read becomes the base; every URL is parsed
    and joined against the base, and a line of the form
    "<url> = <expected>" is checked against the joined result.
    """
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        fp = sys.stdin if fn == '-' else open(fn)
    else:
        from io import StringIO
        fp = StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED', words[2], '!!!!!!!!!!')
    finally:
        # Release the input we opened ourselves; stdin is left alone.
        if fp is not sys.stdin:
            fp.close()
835
# Run the self-test when this file is executed as a script.
if __name__ == '__main__':
    test()