blob: ffb0ff72dfe46e56f3292c4101c7093e6803b7fc [file] [log] [blame]
"""Parse (absolute and relative) URLs.

urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The testcases in
test_urlparse.py provide a good indicator of parsing behavior.
"""
29
Facundo Batista2ac5de22008-07-07 18:24:11 +000030import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000031import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000032
# Public names exported by a star-import of this module.  urlencode() is a
# public function defined in this module, so it is exported together with
# the other query-string helpers.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000037
# Scheme classification tables.  The empty string '' stands for "no scheme
# given", i.e. the behaviour applied by default.  These are plain lists so
# that callers can register additional schemes at runtime.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', '', 'sftp',
]
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
    'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
]
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips',
]
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    'mms', '', 'sftp',
]
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '',
]
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero', '',
]

# Every character that may appear in a scheme name.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() results are memoised in _parse_cache; once it holds
# MAX_CACHE_SIZE entries it is emptied wholesale.
MAX_CACHE_SIZE = 20
_parse_cache = {}
65
def clear_cache():
    """Empty both module-level caches: parsed URLs and cached quoters."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000070
71
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    The host class must provide a ``netloc`` string attribute; every
    property below is derived from it on each access.
    """

    @property
    def username(self):
        """User name from ``user[:password]@host``, or None without '@'."""
        netloc = self.netloc
        if "@" in netloc:
            # The last '@' separates userinfo from the host.
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Password from ``user:password@host``, or None when absent."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                # Only the first ':' separates user from password.
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host without userinfo, port or IPv6 brackets.

        Returns None when the host part is empty.
        """
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            # Bracketed literal: keep what lies between '[' and ']'.
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Port number as an int, or None when no port is given.

        A ':' followed by nothing (``host:``) counts as "no port" and
        yields None.  A non-empty port that is not a base-10 integer
        raises ValueError.
        """
        # Drop userinfo and any bracketed IPv6 literal so that the only
        # ':' left is the port separator.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            if not port:
                # An empty port is not an error; treat it like a missing one.
                return None
            return int(port, 10)
        return None
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000114
115from collections import namedtuple
116
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """Five-field named tuple produced by urlsplit(), plus ResultMixin."""

    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt from this tuple by urlunsplit()."""
        return urlunsplit(self)
123
124
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """Six-field named tuple produced by urlparse(), plus ResultMixin."""

    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt from this tuple by urlunparse()."""
        return urlunparse(self)
131
132
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    # urlsplit() does everything except separating the params.
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
146
def _splitparams(url):
    """Split *url* into (path, params) at a ';'.

    When the path contains a '/', only a ';' at or after the last '/' is
    considered, and (url, '') is returned if there is none.  Without any
    '/', the first ';' is used.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semi = url.find(';', last_slash)
        if semi < 0:
            return url, ''
    else:
        semi = url.find(';')
    return url[:semi], url[semi + 1:]
155
def _splitnetloc(url, start=0):
    """Return (netloc, rest) for *url*, with the netloc beginning at *start*.

    The netloc runs up to the earliest of '/', '?' or '#' found at or
    after *start*, or to the end of the string when none occurs.
    """
    found = [pos for pos in (url.find(ch, start) for ch in '/?#')
             if pos >= 0]
    end = min(found) if found else len(url)
    return url[start:end], url[end:]
163
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    key = (url, scheme, allow_fragments, type(url), type(scheme))
    hit = _parse_cache.get(key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # Avoid runaway growth: drop everything rather than evict.
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    # A literal 'http' prefix is the common case; it skips the
    # per-character scheme check and the uses_* table lookups below.
    is_http = colon > 0 and url[:colon] == 'http'
    if is_http:
        scheme, url = url[:colon].lower(), url[colon + 1:]
    elif colon > 0 and all(ch in scheme_chars for ch in url[:colon]):
        scheme, url = url[:colon].lower(), url[colon + 1:]
    # Otherwise the text before ':' is not a scheme: keep the default
    # scheme and leave the url untouched.

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Exactly one of '[' / ']' being present means an unbalanced literal.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if (allow_fragments and (is_http or scheme in uses_fragment)
            and '#' in url):
        url, fragment = url.split('#', 1)
    if (is_http or scheme in uses_query) and '?' in url:
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return result
212
def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, path, params, query, fragment = components
    # Params ride along as part of the path; urlunsplit() does the rest.
    if params:
        path = "%s;%s" % (path, params)
    return urlunsplit((scheme, netloc, path, query, fragment))
222
def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The components argument can be any five-item
    iterable. This may result in a slightly different, but equivalent URL,
    if the URL that was parsed originally had unnecessary delimiters (for
    example, a ? with an empty query; the RFC states that these are
    equivalent)."""
    scheme, netloc, path, query, fragment = components
    # Emit '//' when there is a netloc, or when the scheme is a
    # netloc-using one and the path does not already begin with '//'.
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and path[:2] != '//')
    if wants_authority:
        if path and path[:1] != '/':
            path = '/' + path
        path = '//' + (netloc or '') + path
    result = path
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return result
240
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, _bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    # A different scheme, or one without relative forms: url stands alone.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty relative path: fall back on the base path, and on the base
        # params / query where the reference supplies none.
        path = bpath
        if not params:
            params = bparams
        else:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))

    # Merge: base path minus its last segment, followed by the reference.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly collapse the first "<name>/.." pair until none is left;
    # the first and last segments are never treated as the '..'.
    while True:
        last = len(segments) - 1
        for i in range(1, last):
            if (segments[i] == '..'
                    and segments[i - 1] not in ('', '..')):
                del segments[i - 1:i + 1]
                break
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
297
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
311
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str (encoded as UTF-8 first) or a bytes-like object
    supporting split().  A '%' is decoded only when it is followed by
    exactly two hexadecimal digits; any other '%' is copied through
    unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    pieces = string.split(b'%')
    if len(pieces) == 1:
        return string
    hexdigits = b'0123456789abcdefABCDEF'
    out = [pieces[0]]
    for item in pieces[1:]:
        pair = item[:2]
        # Validate explicitly: int() alone would also accept a single
        # digit, a sign or surrounding whitespace (e.g. '%2', '%+1').
        if len(pair) == 2 and pair[0] in hexdigits and pair[1] in hexdigits:
            out.append(bytes([int(pair, 16)]))
            out.append(item[2:])
        else:
            out.append(b'%')
            out.append(item)
    # Join with an empty object of the input's own type so that bytearray
    # input still produces a bytearray.
    return string[:0].join(out)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000330
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is kept
    in the output unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if not string:
        return string
    pieces = string.split('%')
    if len(pieces) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789abcdefABCDEF'
    out = [pieces[0]]
    # pending: contiguous run of percent-encoded bytes not yet decoded.
    pending = bytearray()
    for item in pieces[1:]:
        pair = item[:2]
        # Validate explicitly: bytes.fromhex() skips whitespace, so a pair
        # of spaces would otherwise be swallowed together with its '%'.
        if len(pair) == 2 and pair[0] in hexdigits and pair[1] in hexdigits:
            pending.append(int(pair, 16))
            rest = item[2:]
            if not rest:
                # Just one escaped byte; it may be part of a multi-byte
                # character, so delay decoding until the run ends.
                continue
        else:
            rest = '%' + item
        # Literal text ends the run: decode it, then append the text.
        out.append(bytes(pending).decode(encoding, errors) + rest)
        pending = bytearray()
    if pending:
        # Decode the final run.
        out.append(bytes(pending).decode(encoding, errors))
    return ''.join(out)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000374
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

        Arguments:

        qs: URL-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            URL encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        Returns a dict mapping each name to the list of its values, in
        order of appearance.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
400
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were  not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples.
    """
    result = []
    # Fields are separated by '&' and, within those, by ';'.
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not field and not strict_parsing:
                continue
            name, sep, value = field.partition('=')
            if not sep:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # Handle case of a control-name with no equal sign
                if not keep_blank_values:
                    continue
            if value or keep_blank_values:
                result.append((unquote(name.replace('+', ' ')),
                               unquote(value.replace('+', ' '))))
    return result
440
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # Translate '+' first so that an escaped plus (%2B) survives as '+'.
    return unquote(string.replace('+', ' '), encoding, errors)
449
# Byte values that are never percent-encoded: ASCII letters, digits and
# the three punctuation characters '_', '.' and '-'.
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    b'abcdefghijklmnopqrstuvwxyz'
    b'0123456789'
    b'_.-'
)
# The same values as a bytes object.
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache of quoting callables keyed by the normalised 'safe' bytes.
_safe_quoters = {}
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000456
class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # The defaultdict base doubles as the cache: only the first lookup of
    # a byte value reaches __missing__; later ones are plain dict hits.

    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
478
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects.  encoding and
    errors may only be given when string is a str; passing either one
    together with a non-str string raises TypeError.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Already bytes: there is nothing to encode, so reject the options.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if not string:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'strict'
    return quote_from_bytes(string.encode(encoding, errors), safe)
522
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # With no space in a str or bytes input, plain quote() already gives
    # the right answer.
    if isinstance(string, str):
        space_free = ' ' not in string
    elif isinstance(string, bytes):
        space_free = b' ' not in string
    else:
        space_free = False
    if space_free:
        return quote(string, safe, encoding, errors)
    # Mark the space as safe (in the same type as 'safe') so it comes
    # through quote() untouched, then turn each one into '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000539
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\\xab') -> 'abc%20def%AB'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    # Normalize 'safe' to bytes restricted to the ASCII range.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)
    # If stripping safe bytes leaves nothing, nothing needs quoting.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000561
def urlencode(query, doseq=False):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values that are str or bytes are quoted as they are; any
    other object is converted with str() first.  Raises TypeError when
    query is neither a mapping nor a sequence of tuples.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    def _quoted(item):
        # str() on bytes would give the "b'...'" repr, so pass bytes to
        # quote_plus() directly.
        if isinstance(item, (str, bytes)):
            return quote_plus(item)
        return quote_plus(str(item))

    pairs = []
    for k, v in query:
        k = _quoted(k)
        if not doseq or isinstance(v, (str, bytes)):
            # A str or bytes value is a single value, never a sequence
            # of characters or integers.
            pairs.append(k + '=' + _quoted(v))
            continue
        try:
            # Is this a sufficient test for sequence-ness?
            len(v)
        except TypeError:
            # not a sequence
            pairs.append(k + '=' + _quoted(v))
        else:
            # one parameter per element of the sequence
            for elt in v:
                pairs.append(k + '=' + _quoted(elt))
    return '&'.join(pairs)
617
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') --> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') -->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') --> 'abc def'
# quote('abc def') --> 'abc%20def'
632
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str argument is checked to be pure ASCII and returned as a str;
    UnicodeError is raised otherwise.  Any other object is returned as is.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
645
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    text = str(url).strip()
    # Peel off an enclosing '<...>' pair, then a leading 'URL:' tag.
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
653
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is lower-cased.  Returns (None, url) when url does not start
    with a run of characters other than '/' and ':' followed by ':'.
    """
    global _typeprog
    if _typeprog is None:
        # Compile the pattern on first use only.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    scheme = found.group(1)
    return scheme.lower(), url[len(scheme) + 1:]
667
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when url does not match the '//host...' pattern.
    """
    global _hostprog
    if _hostprog is None:
        # Compile the pattern on first use only.
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1, 2)
679
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    Both parts are passed through unquote().  Always returns a 2-tuple;
    without an '@' the result is (None, host).
    """
    global _userprog
    if _userprog is None:
        # Compile the pattern on first use only.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match:
        # Build a real tuple: a bare map() object is a one-shot iterator
        # and would not match the tuple returned on the no-match path.
        return tuple(unquote(part) for part in match.group(1, 2))
    return None, host
691
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    Splits at the first ':'; returns (user, None) when there is none.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compile on first use; re.S lets the password contain newlines.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$',re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
703
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  Returns (host, None)
    when host does not end in ':' followed by one or more digits.
    """
    global _portprog
    if _portprog is None:
        # Compile the pattern on first use only.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1, 2)
716
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        # Compile the pattern on first use only.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port = found.group(1, 2)
    try:
        # An empty port is treated exactly like an unparsable one.
        if not port:
            raise ValueError("no digits")
        number = int(port)
    except ValueError:
        number = None
    return host, number
738
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    Splits at the last '?'; returns (url, None) when there is none.
    """
    global _queryprog
    if _queryprog is None:
        # Compile on first use.  The pattern is a raw string so that the
        # backslash before '?' reaches the regex engine instead of being
        # read as an (invalid) string escape.
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
750
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    Splits at the last '#'; returns (url, None) when there is none.
    """
    global _tagprog
    if _tagprog is None:
        # Compile the pattern on first use only.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
762
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    # Everything before the first ';' is the path; the rest are attributes.
    path, *attrs = url.split(';')
    return path, attrs
768
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    Splits at the first '='; returns (attr, None) when there is none.
    """
    global _valueprog
    if _valueprog is None:
        # Compile the pattern on first use only.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)
780
# Default input for test().  Each non-blank line is split on whitespace:
# the first word is a URL reference, and when the line has the form
# "<ref> = <URL:...>" the third word is the expected urljoin() result.
# The first line processed supplies the base URL for all later lines.
test_input = """
 http://a/b/c/d

 g:h = <URL:g:h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 g = <URL:http://a/b/c/g>
 ./g = <URL:http://a/b/c/g>
 g/ = <URL:http://a/b/c/g/>
 /g = <URL:http://a/g>
 //g = <URL:http://g>
 ?y = <URL:http://a/b/c/d?y>
 g?y = <URL:http://a/b/c/g?y>
 g?y/./x = <URL:http://a/b/c/g?y/./x>
 . = <URL:http://a/b/c/>
 ./ = <URL:http://a/b/c/>
 .. = <URL:http://a/b/>
 ../ = <URL:http://a/b/>
 ../g = <URL:http://a/b/g>
 ../.. = <URL:http://a/>
 ../../g = <URL:http://a/g>
 ../../../g = <URL:http://a/../g>
 ./../g = <URL:http://a/b/g>
 ./g/. = <URL:http://a/b/c/g/>
 /./g = <URL:http://a/./g>
 g/./h = <URL:http://a/b/c/g/h>
 g/../h = <URL:http://a/b/c/h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 http:?y = <URL:http://a/b/c/d?y>
 http:g?y = <URL:http://a/b/c/g?y>
 http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
814
def test():
    """Run the urlparse()/urljoin() self-check and print the results.

    Input is read from the file named by the first command-line argument
    ('-' means standard input), or from the built-in test_input when no
    argument is given.  The first URL read becomes the base for the rest.
    A line of the form "<ref> = <expected>" whose expectation is not met
    gets an extra 'EXPECTED' line printed.
    """
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        from io import StringIO
        fp = StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED', words[2], '!!!!!!!!!!')
    finally:
        # Close whatever this function opened, but never the process's
        # standard input.
        if fp is not sys.stdin:
            fp.close()
841
# Run the self-check in test() when this file is executed as a script.
if __name__ == '__main__':
    test()