"""Parse (absolute and relative) URLs.

The urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to the
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
"""

import collections
import sys

# Names exported by ``from <this module> import *``.  urlencode() is a
# documented public function of this module, so it is listed alongside the
# other quoting/parsing entry points.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]

# A classification of schemes ('' means apply by default).
# These are plain lists; the parsing functions in this module test
# membership with ``scheme in <list>``.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() empties its cache once it holds this many entries.
MAX_CACHE_SIZE = 20
# Maps urlsplit() argument keys to the SplitResult computed for them.
_parse_cache = {}


def clear_cache():
    """Clear the parse cache and the quoters cache."""
    _parse_cache.clear()
    _safe_quoters.clear()

class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Every property here is derived from ``self.netloc``, which the
    concrete result classes supply as a string shaped like
    ``[user[:password]@]host[:port]``.
    """

    @property
    def username(self):
        """Text before the first ':' of the userinfo, or None without '@'."""
        netloc = self.netloc
        if "@" in netloc:
            # The userinfo is everything before the LAST '@'.
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Text after the first ':' of the userinfo, or None if absent."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host, without brackets or port; None when empty."""
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            # Bracketed (IPv6 literal) host: keep what is inside the brackets.
            host = netloc.split(']')[0][1:]
        elif ':' in netloc:
            host = netloc.split(':')[0]
        else:
            host = netloc
        # An empty host component (e.g. netloc ":80") is reported as None,
        # the same value already used for an empty netloc.
        return host.lower() or None

    @property
    def port(self):
        """Port as an int, or None when no (or an empty) port is given.

        Raises ValueError if the port text is present but not a base-10
        integer.
        """
        # Drop the userinfo and any bracketed host so that only "[:port]"
        # (or "host[:port]") remains.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            # "host:" has a delimiter but no digits: treat as "no port"
            # instead of letting int('') raise.
            if port:
                return int(port, 10)
        return None

from collections import namedtuple


class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
    """5-field result tuple: (scheme, netloc, path, query, fragment).

    The netloc-derived properties (username, password, hostname, port)
    come from ResultMixin.
    """

    __slots__ = ()

    def geturl(self):
        """Return the URL string produced by urlunsplit() for this tuple."""
        return urlunsplit(self)

class ParseResult(collections.namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
    """6-field result tuple: (scheme, netloc, path, params, query, fragment).

    The netloc-derived properties (username, password, hostname, port)
    come from ResultMixin.
    """

    __slots__ = ()

    def geturl(self):
        """Return the URL string produced by urlunparse() for this tuple."""
        return urlunparse(self)

def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    The work is delegated to urlsplit(); the only addition is splitting
    ';params' off the path for schemes listed in uses_params."""
    scheme, netloc, url, query, fragment = urlsplit(url, scheme,
                                                    allow_fragments)
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)

def _splitparams(url):
    """Split ';params' off the last path segment of *url*.

    Returns (path, params).  When *url* contains a '/', only a ';' at or
    after the last '/' counts.  If no such ';' exists the whole string is
    returned as the path and params is ''.
    """
    if '/' in url:
        i = url.find(';', url.rfind('/'))
    else:
        i = url.find(';')
    # Guard both branches: slicing with i == -1 would chop the last
    # character off the path.
    if i < 0:
        return url, ''
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    """Split *url* into (netloc, rest), scanning from index *start*.

    The netloc ends at the first '/', '?' or '#' found at or after
    *start*, or at the end of the string if none is present.  *rest*
    begins with that delimiter (or is '' when there is none).
    """
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    *scheme* is the default used when *url* carries none.  Results are
    memoized in _parse_cache.  Raises ValueError("Invalid IPv6 URL") when
    the netloc has an unmatched '[' or ']'."""
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached is not None:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    # A literal lower-case 'http' prefix is the common case: it skips the
    # per-character scheme validation and the uses_fragment / uses_query
    # lookups (fragment and query are always split for it).
    is_plain_http = False
    i = url.find(':')
    if i > 0:
        prefix = url[:i]
        if prefix == 'http':
            is_plain_http = True
            scheme, url = prefix, url[i+1:]
        elif all(c in scheme_chars for c in prefix):
            scheme, url = prefix.lower(), url[i+1:]
        # Otherwise the text before ':' is not a scheme; leave url intact.
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Exactly one of '[' / ']' present means a malformed IPv6 literal.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if (allow_fragments and '#' in url
            and (is_plain_http or scheme in uses_fragment)):
        url, fragment = url.split('#', 1)
    if '?' in url and (is_plain_http or scheme in uses_query):
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v

def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    *components* is any six-item iterable in the order
    (scheme, netloc, path, params, query, fragment)."""
    scheme, netloc, url, params, query, fragment = components
    if params:
        # Re-attach the params to the path before delegating.
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))

def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string.  The components argument can be any five-item
    iterable in the order (scheme, netloc, path, query, fragment).
    This may result in a slightly different, but equivalent URL, if the URL
    that was parsed originally had unnecessary delimiters (for example, a ?
    with an empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment = components
    # Emit the '//' authority marker when there is a netloc, or when the
    # scheme is one that uses a netloc and the path does not already start
    # with '//'.
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url

def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other is returned unchanged.  *url* is
    also returned unchanged when its scheme differs from the base's scheme
    or is not listed in uses_relative."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    # The base scheme is the default for a URL that carries none.
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # url names its own authority: nothing is taken from base.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: reuse the base path, then inherit params and query
        # from base when url supplies none.
        path = bpath
        if not params:
            params = bparams
        else:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: drop the last segment of the base path and append
    # the segments of the relative path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse the first "<name>/.." pair found among the
    # interior segments until a full pass finds none.
    while True:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                    and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # A trailing '..' removes itself and the preceding segment.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))

def urldefrag(url):
    """Remove any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string and the URL is returned unchanged.
    """
    if '#' not in url:
        return url, ''
    s, n, p, a, q, frag = urlparse(url)
    # Rebuild the URL with the fragment slot emptied.
    defrag = urlunparse((s, n, p, a, q, ''))
    return defrag, frag

# Byte values of the ASCII hexadecimal digits; used to validate %xx escapes.
_HEX_DIGIT_BYTES = frozenset(b'0123456789ABCDEFabcdef')


def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes-like object.  A '%' that is not
    followed by exactly two hexadecimal digits is left in the output
    unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    pieces = string.split(b'%')
    if len(pieces) == 1:
        return string
    out = [pieces[0]]
    for item in pieces[1:]:
        code = item[:2]
        # Check the digits explicitly: int(x, 16) would also accept a
        # single digit, a sign or surrounding whitespace ("%a", "%+1").
        if (len(code) == 2 and code[0] in _HEX_DIGIT_BYTES
                and code[1] in _HEX_DIGIT_BYTES):
            out.append(bytes([int(code, 16)]))
            out.append(item[2:])
        else:
            out.append(b'%')
            out.append(item)
    return b''.join(out)

# ASCII hexadecimal digits; used to validate %xx escapes in unquote().
_HEX_DIGIT_CHARS = frozenset('0123456789ABCDEFabcdef')


def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is kept
    in the output unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if not string:
        return string
    pieces = string.split('%')
    if len(pieces) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    out = [pieces[0]]
    # pending: contiguous run of percent-encoded bytes not yet decoded.
    # Adjacent escapes may form one multi-byte character, so decoding is
    # delayed until the run is interrupted.
    pending = bytearray()
    for item in pieces[1:]:
        code = item[:2]
        # Check the digits explicitly; bytes.fromhex() skips whitespace
        # and would silently swallow text such as "%  ".
        if (len(code) == 2 and code[0] in _HEX_DIGIT_CHARS
                and code[1] in _HEX_DIGIT_CHARS):
            pending.append(int(code, 16))
            rest = item[2:]
            if not rest:
                # This segment was just a single percent-encoded byte.
                continue
        else:
            rest = '%' + item
        # Encountered non-percent-encoded characters: flush the run.
        out.append(bytes(pending).decode(encoding, errors))
        out.append(rest)
        del pending[:]
    if pending:
        # Flush the final run.
        out.append(bytes(pending).decode(encoding, errors))
    return ''.join(out)

def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

        Arguments:

        qs: URL-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            URL encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        Returns a dict mapping each name to the list of its values, in
        the order parse_qsl() produced them.
    """
    parsed = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        parsed.setdefault(name, []).append(value)
    return parsed

def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were  not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples in the order the fields
    appear in qs.
    """
    # Fields may be separated by either '&' or ';'.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            # '+' stands for a space in form-encoded data.
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r

def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    The '+' replacement happens before percent-decoding, so an escaped
    plus ('%2B') is returned as a literal '+'.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    string = string.replace('+', ' ')
    return unquote(string, encoding, errors)

# Byte values that are never percent-encoded.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Maps a normalized 'safe' bytes value to the bound __getitem__ of the
# Quoter built for it (filled in by quote_from_bytes()).
_safe_quoters = {}


class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Handle a cache miss. Store quoted string in cache and return.
        res = chr(b) if b in self.safe else '%{:02X}'.format(b)
        self[b] = res
        return res

def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects.  encoding and
    errors must not be specified if string is a bytes object; doing so
    raises TypeError.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if isinstance(string, str):
        if not string:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        string = string.encode(encoding, errors)
    else:
        # Bytes input is quoted as-is; an encoding would have no effect.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    return quote_from_bytes(string, safe)

def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if ((isinstance(string, str) and ' ' not in string) or
            (isinstance(string, bytes) and b' ' not in string)):
        return quote(string, safe, encoding, errors)
    # Keep spaces unquoted by adding one to 'safe' (in safe's own type),
    # then turn each of them into '+'.
    if isinstance(safe, str):
        space = ' '
    else:
        space = b' '
    string = quote(string, safe + space, encoding, errors)
    return string.replace(' ', '+')

def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'

    Raises TypeError if bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes([c for c in safe if c < 128])
    # Fast path: nothing is left after stripping safe bytes from the right,
    # so every byte is safe and no quoting is needed.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        # One Quoter per distinct 'safe' value, kept for later calls.
        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
    return ''.join([quoter(char) for char in bs])

def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Each key and value may be bytes, str, or any other object (which is
    converted with str()).  bytes components are passed to quote_plus()
    with safe only; all others are passed with safe, encoding and errors.

    Raises TypeError if query is neither a mapping nor a sequence of
    tuples (a non-empty string, for instance).
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    def _quote_component(value):
        # bytes are quoted as-is; anything else is stringified first.
        if isinstance(value, bytes):
            return quote_plus(value, safe)
        return quote_plus(str(value), safe, encoding, errors)

    l = []
    for k, v in query:
        k = _quote_component(k)
        if not doseq or isinstance(v, bytes):
            l.append(k + '=' + _quote_component(v))
        elif isinstance(v, str):
            l.append(k + '=' + quote_plus(v, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                l.append(k + '=' + _quote_component(v))
            else:
                # loop over the sequence
                for elt in v:
                    l.append(k + '=' + _quote_component(elt))
    return '&'.join(l)

# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'

def to_bytes(url):
    """to_bytes("URL") --> 'URL'.

    Despite the name, a str argument is returned as a str: it is only
    checked to be pure ASCII.  Non-str arguments are returned untouched.
    Raises UnicodeError if a str contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if isinstance(url, str):
        try:
            url = url.encode("ASCII").decode()
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Strips surrounding whitespace, then one pair of enclosing '<' '>',
    then a leading 'URL:' prefix.  The argument is converted with str()
    first.
    """
    url = str(url).strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:':
        url = url[4:].strip()
    return url

# Compiled lazily on the first splittype() call.
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned lower-cased.  Returns (None, url) when url does
    not start with one or more characters other than '/' and ':' followed
    by a ':'.
    """
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile(r'^([^/:]+):')

    match = _typeprog.match(url)
    if match:
        scheme = match.group(1)
        return scheme.lower(), url[len(scheme) + 1:]
    return None, url

# Compiled lazily on the first splithost() call.
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/' or '?'.  Returns (None, url) when
    url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile(r'^//([^/?]*)(.*)$')

    match = _hostprog.match(url)
    if match:
        return match.group(1, 2)
    return None, url

# Compiled lazily on the first splituser() call.
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'; both parts are passed through
    unquote().  Returns (None, host) when there is no '@'.
    """
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile(r'^(.*)@(.*)$')

    match = _userprog.match(host)
    if match:
        # Materialize the result: a bare map() object is a one-shot
        # iterator, unlike the tuple returned on the no-match path.
        return tuple(map(unquote, match.group(1, 2)))
    return None, host

# Compiled lazily on the first splitpasswd() call.
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; the password may itself contain
    ':' and newlines.  Returns (user, None) when there is no ':'.
    """
    global _passwdprog
    if _passwdprog is None:
        import re
        # re.S lets '.' match newlines inside the password.
        _passwdprog = re.compile(r'^([^:]*):(.*)$', re.S)

    match = _passwdprog.match(user)
    if match:
        return match.group(1, 2)
    return user, None

# Compiled lazily on the first splitport() call.
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string and must consist of one or more
    digits 0-9 after the last ':'.  Otherwise returns (host, None).
    """
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile(r'^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match:
        return match.group(1, 2)
    return host, None

# Compiled lazily on the first splitnport() call.
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile(r'^(.*):(.*)$')

    match = _nportprog.match(host)
    if not match:
        return host, defport
    host, port = match.group(1, 2)
    try:
        # An empty port ("host:") counts as "not a valid number".
        nport = int(port) if port else None
    except ValueError:
        nport = None
    return host, nport

# Compiled lazily on the first splitquery() call.
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Returns (url, None) when there
    is no '?'.
    """
    global _queryprog
    if _queryprog is None:
        import re
        # Raw string: '\?' is not a valid escape in a normal str literal.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None

# Compiled lazily on the first splittag() call.
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Returns (url, None) when there
    is no '#'.
    """
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile(r'^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs

# Compiled lazily on the first splitvalue() call.
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Returns (attr, None) when there
    is no '='.
    """
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile(r'^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match:
        return match.group(1, 2)
    return attr, None

# Input for test(): the first non-blank line is the base URL; every other
# line reads "<relative URL> = <expected urljoin() result wrapped in <URL:>>".
# Lines are split on whitespace, so column alignment is cosmetic only.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""

def test():
    """Run the urljoin() self-test and print the results.

    Input lines come from the file named by sys.argv[1] ('-' means
    standard input), or from test_input when no argument is given.  The
    first URL read becomes the base for every later urljoin() call.  For
    lines of the form "url = expected", a mismatch is reported with an
    'EXPECTED' line.
    """
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        from io import StringIO
        fp = StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED', words[2], '!!!!!!!!!!')
    finally:
        # Release the file (or StringIO) we opened; stdin is not ours.
        if fp is not sys.stdin:
            fp.close()


if __name__ == '__main__':
    test()