blob: 0b96b5b41d329fc4d8f388e8c7fa0e7b4d296663 [file] [log] [blame]
"""Parse (absolute and relative) URLs.

The urlparse module is based upon the following RFC specifications.
4
RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski, July 1998.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000015
RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.
18
RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.
21
RFC 3986 is considered the current standard, and any future changes to the
urlparse module should conform to it.  The urlparse module is currently not
entirely compliant with this RFC: some de facto parsing scenarios are
supported and, for backward compatibility, some parsing quirks from older
RFCs are retained.  The test cases in test_urlparse.py provide a good
indicator of parsing behavior.
"""
29
Facundo Batista2ac5de22008-07-07 18:24:11 +000030import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000031import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000032
# Names exported by ``from <module> import *``.  urlencode() is a public,
# documented function of this module and is therefore listed as well.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000037
38# A classification of schemes ('' means apply by default)
39uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
40 'wais', 'file', 'https', 'shttp', 'mms',
41 'prospero', 'rtsp', 'rtspu', '', 'sftp']
42uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
43 'imap', 'wais', 'file', 'mms', 'https', 'shttp',
44 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
Florent Xiclunac7b8e862010-05-17 17:33:07 +000045 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
Jeremy Hylton1afc1692008-06-18 20:49:58 +000046non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
47 'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
48uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
49 'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
50 'mms', '', 'sftp']
51uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
52 'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
53uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
54 'nntp', 'wais', 'https', 'shttp', 'snews',
55 'file', 'prospero', '']
56
57# Characters valid in scheme names
58scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
59 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
60 '0123456789'
61 '+-.')
62
63MAX_CACHE_SIZE = 20
64_parse_cache = {}
65
def clear_cache():
    """Clear the parse cache and the quoters cache."""
    _parse_cache.clear()
    _safe_quoters.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000070
71
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    The host class must provide a ``netloc`` attribute; every property
    below is computed from that string alone.
    """

    @property
    def username(self):
        """User name from the part of netloc before the last '@', or None."""
        netloc = self.netloc
        if "@" not in netloc:
            return None
        userinfo = netloc.rsplit("@", 1)[0]
        # Everything up to the first ':' is the user name; when there is
        # no ':' the whole userinfo is returned.
        return userinfo.split(":", 1)[0]

    @property
    def password(self):
        """Password (text after the first ':' in the userinfo), or None."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host name, or None when netloc has no host part.

        For a bracketed literal ('[...]') the text between the brackets is
        returned without the brackets.
        """
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Port as an int, or None when no port is given.

        An empty port ('host:') is treated like a missing one and yields
        None.  A non-empty, non-decimal port raises ValueError.
        """
        # Drop userinfo and any bracketed literal so that the only ':'
        # left is the one introducing the port.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' not in netloc:
            return None
        port = netloc.split(':')[1]
        if not port:
            return None
        return int(port, 10)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000114
115from collections import namedtuple
116
class SplitResult(namedtuple('SplitResult',
                             'scheme netloc path query fragment'),
                  ResultMixin):
    """5-tuple (scheme, netloc, path, query, fragment) built by urlsplit()."""

    __slots__ = ()

    def geturl(self):
        """Return the URL obtained by passing this tuple to urlunsplit()."""
        return urlunsplit(self)
123
124
class ParseResult(namedtuple('ParseResult',
                             'scheme netloc path params query fragment'),
                  ResultMixin):
    """6-tuple (scheme, netloc, path, params, query, fragment) built by
    urlparse()."""

    __slots__ = ()

    def geturl(self):
        """Return the URL obtained by passing this tuple to urlunparse()."""
        return urlunparse(self)
131
132
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    scheme, netloc, url, query, fragment = urlsplit(url, scheme,
                                                    allow_fragments)
    # Parameters are only split off for schemes listed in uses_params.
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)
146
147def _splitparams(url):
148 if '/' in url:
149 i = url.find(';', url.rfind('/'))
150 if i < 0:
151 return url, ''
152 else:
153 i = url.find(';')
154 return url[:i], url[i+1:]
155
156def _splitnetloc(url, start=0):
157 delim = len(url) # position of end of domain part of url, default is end
158 for c in '/?#': # look for delimiters; the order is NOT important
159 wdelim = url.find(c, start) # find first of this delim
160 if wdelim >= 0: # if found
161 delim = min(delim, wdelim) # use earliest delim position
162 return url[start:delim], url[delim:] # return (domain, rest)
163
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Results are memoized in _parse_cache.  Raises ValueError
    ("Invalid IPv6 URL") when the netloc contains only one of '[' and ']'.
    """
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key so that equal-comparing
    # arguments of different types do not share a cache entry.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key)
    if cached is not None:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http':  # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                # Exactly one of '[' / ']' present: unbalanced brackets.
                if ('[' in netloc) != (']' in netloc):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        # Accept the text before ':' as a scheme only if every character
        # is a valid scheme character.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    # Outside the 'http' fast path, fragments and queries are only split
    # off for schemes listed in uses_fragment / uses_query.
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
212
def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment = components
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))
222
def urlunsplit(components):
    """Combine a 5-sequence (scheme, netloc, path, query, fragment) into a
    URL string.  Empty scheme, query and fragment components are omitted
    together with their delimiters."""
    scheme, netloc, url, query, fragment = components
    # Emit '//' when there is a netloc, or when the scheme is one that uses
    # a netloc and the path does not already start with '//'.
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
235
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    Returns *url* unchanged when *base* is empty, when the schemes differ,
    or when the scheme is not listed in uses_relative; returns *base*
    when *url* is empty.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference carries its own authority: nothing to inherit.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: inherit the base path, and the base params / query
        # when the reference has none of its own.
        path = bpath
        if not params:
            params = bparams
        else:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: replace the last segment of the base path with the
    # reference path, then resolve '.' and '..' segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while True:
        # Remove one "<name>/.." pair per pass; stop when a full scan
        # finds none.  The final segment is handled separately below.
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                    and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
292
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' not in url:
        return url, ''
    s, n, p, a, q, frag = urlparse(url)
    defrag = urlunparse((s, n, p, a, q, ''))
    return defrag, frag
306
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes-like object.  A '%' that is not
    followed by exactly two hexadecimal digits is left in the output
    unchanged.  An empty (falsy) argument yields b''.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    res = string.split(b'%')
    if len(res) == 1:
        return string
    hexdigits = b'0123456789ABCDEFabcdef'
    parts = [res[0]]
    for item in res[1:]:
        pair = item[:2]
        # Check the two characters explicitly: int(x, 16) alone would also
        # accept a single digit, a sign or surrounding whitespace.
        if len(pair) == 2 and pair[0] in hexdigits and pair[1] in hexdigits:
            parts.append(bytes([int(pair, 16)]))
            parts.append(item[2:])
        else:
            parts.append(b'%')
            parts.append(item)
    return b''.join(parts)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000325
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is kept
    in the result unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if not string:
        return string
    res = string.split('%')
    if len(res) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    # pct_sequence: contiguous sequence of percent-encoded bytes, not yet
    # decoded (a multi-byte character may span several %xx escapes).
    pct_sequence = b''
    parts = [res[0]]
    for item in res[1:]:
        pair = item[:2]
        # Validate the digits explicitly; bytes.fromhex() would silently
        # skip whitespace and so swallow text such as '%  '.
        if len(pair) == 2 and pair[0] in hexdigits and pair[1] in hexdigits:
            pct_sequence += bytes([int(pair, 16)])
            rest = item[2:]
            if not rest:
                # This segment was just a single percent-encoded character.
                # May be part of a sequence of code units, so delay decoding.
                # (Stored in pct_sequence).
                continue
        else:
            rest = '%' + item
        # Encountered non-percent-encoded characters. Flush the current
        # pct_sequence.
        parts.append(pct_sequence.decode(encoding, errors))
        parts.append(rest)
        pct_sequence = b''
    if pct_sequence:
        # Flush the final pct_sequence
        parts.append(pct_sequence.decode(encoding, errors))
    return ''.join(parts)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000369
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dict mapping each field name to the list of its values,
    in the order they appear in qs.
    """
    result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        if name in result:
            result[name].append(value)
        else:
            result[name] = [value]
    return result
395
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples.
    """
    # Fields may be separated by either '&' or ';'.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            # '+' stands for a space and must be translated before the
            # %xx escapes are expanded.
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
435
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # Plus signs are translated first, so an escaped '%2B' still decodes
    # to a literal '+'.
    string = string.replace('+', ' ')
    return unquote(string, encoding, errors)
444
445_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
446 b'abcdefghijklmnopqrstuvwxyz'
447 b'0123456789'
448 b'_.-')
Florent Xiclunac7b8e862010-05-17 17:33:07 +0000449_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
450_safe_quoters = {}
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000451
class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Handle a cache miss. Store quoted string in cache and return.
        res = chr(b) if b in self.safe else '%{:02X}'.format(b)
        self[b] = res
        return res
473
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects. encoding and
    errors must not be specified if string is a bytes object (a
    TypeError is raised in that case).

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if isinstance(string, str):
        if not string:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        string = string.encode(encoding, errors)
    else:
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    return quote_from_bytes(string, safe)
516 return quote_from_bytes(string, safe)
517
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if ((isinstance(string, str) and ' ' not in string) or
            (isinstance(string, bytes) and b' ' not in string)):
        return quote(string, safe, encoding, errors)
    # Let spaces pass through quote() untouched (the extra character must
    # have the same type as safe), then turn them into '+'.
    if isinstance(safe, str):
        space = ' '
    else:
        space = b' '
    string = quote(string, safe + space, encoding, errors)
    return string.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000534
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\\xab') -> 'abc%20def%AB'

    Raises TypeError when bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes([c for c in safe if c < 128])
    # Fast path: if stripping every safe byte leaves nothing, no byte
    # needs quoting.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
    return ''.join([quoter(char) for char in bs])
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000556
def urlencode(query, doseq=False):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    str and bytes keys/values are passed to quote_plus() as they are; any
    other object is converted with str() first.  Raises TypeError when
    query is neither a mapping nor a non-string sequence of tuples.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    def encode(value):
        # bytes must not go through str(): that would quote their repr.
        if not isinstance(value, (str, bytes)):
            value = str(value)
        return quote_plus(value)

    pairs = []
    for k, v in query:
        k = encode(k)
        if not doseq or isinstance(v, (str, bytes)):
            pairs.append(k + '=' + encode(v))
            continue
        try:
            # Is this a sufficient test for sequence-ness?
            len(v)
        except TypeError:
            # not a sequence
            pairs.append(k + '=' + encode(v))
        else:
            # loop over the sequence
            for elt in v:
                pairs.append(k + '=' + encode(elt))
    return '&'.join(pairs)
612
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') --> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') -->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') --> 'abc def'
# quote('abc def') --> 'abc%20def'
627
def to_bytes(url):
    """to_bytes("URL") --> 'URL'.

    Despite its name this returns a str: a str argument is checked to be
    pure ASCII and returned; any other object is returned unchanged.
    Raises UnicodeError when a str argument contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if isinstance(url, str):
        try:
            url = url.encode("ASCII").decode()
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url
640
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Strips surrounding whitespace, one enclosing '<...>' pair and a
    leading 'URL:' prefix, in that order.
    """
    url = str(url).strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:':
        url = url[4:].strip()
    return url
648
_typeprog = None   # compiled lazily on first use
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned lower-cased.  Returns (None, url) when url does
    not start with a 'type:' prefix free of '/' and ':'.
    """
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile(r'^([^/:]+):')

    match = _typeprog.match(url)
    if match:
        scheme = match.group(1)
        return scheme.lower(), url[len(scheme) + 1:]
    return None, url
662
_hostprog = None   # compiled lazily on first use
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile(r'^//([^/?]*)(.*)$')

    match = _hostprog.match(url)
    if match:
        return match.group(1, 2)
    return None, url
674
_userprog = None   # compiled lazily on first use
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'; both parts are passed through
    unquote().  Returns (None, host) when there is no '@'.
    """
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile(r'^(.*)@(.*)$')

    match = _userprog.match(host)
    if match:
        # Materialize the result so that both branches return a 2-tuple
        # (a bare map() object cannot be indexed or reused).
        return tuple(map(unquote, match.group(1, 2)))
    return None, host
686
_passwdprog = None   # compiled lazily on first use
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; the password may contain further
    colons and newlines.  Returns (user, None) when there is no ':'.
    """
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile(r'^([^:]*):(.*)$', re.S)

    match = _passwdprog.match(user)
    if match:
        return match.group(1, 2)
    return user, None
698
# splitport('host:port') --> 'host', 'port'
_portprog = None   # compiled lazily on first use
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string and must consist of decimal digits
    only; otherwise (host, None) is returned with host unchanged.
    """
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile(r'^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match:
        return match.group(1, 2)
    return host, None
711
_nportprog = None   # compiled lazily on first use
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile(r'^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            # An empty port ('host:') counts as "not a valid number".
            if not port:
                raise ValueError("no digits")
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport
733
_queryprog = None   # compiled lazily on first use
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Returns (url, None) when there
    is no '?'.
    """
    global _queryprog
    if _queryprog is None:
        import re
        # Raw string: '\?' in a normal string literal is an invalid
        # escape sequence.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
745
_tagprog = None   # compiled lazily on first use
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Returns (url, None) when there
    is no '#'.
    """
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile(r'^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
757
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
763
_valueprog = None   # compiled lazily on first use
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Returns (attr, None) when there
    is no '='.
    """
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile(r'^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match:
        return match.group(1, 2)
    return attr, None
775
# Input for test(): the first non-blank line is the base URL; each further
# line is "<relative-url> = <expected result of joining it to the base>".
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""
809
def test():
    """Parse and join URLs read line by line, printing the results.

    Input comes from the file named by sys.argv[1] ('-' means stdin), or
    from test_input when no argument is given.  The first URL read becomes
    the base for all later joins.  A line of the form
    "<url> = <expected>" is checked against the join result, and a
    mismatch is reported on stdout.
    """
    base = ''
    opened = None   # file object this function owns and must close
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        from io import StringIO
        fp = StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED', words[2], '!!!!!!!!!!')
    finally:
        if opened is not None:
            opened.close()
836
# Run the self-test when this file is executed as a script.
if __name__ == '__main__':
    test()