blob: f1e54d35e98c23ca6e3a9d759fbe3a9a2da7b09f [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaran420ec8a2010-04-17 14:30:53 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
11RFC2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
14RFC2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
15
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
19RFC1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20McCahill, December 1994
21
RFC 3986 is considered the current standard and any changes to the urlparse
module should conform to it. The urlparse module is not entirely compliant
with RFC 3986: de facto parsing scenarios are sometimes taken into account
and, for backward compatibility, some parsing behavior from the older RFCs
is retained. The test cases in test_urlparse.py provide a good indicator of
the parsing behavior.
27
Guido van Rossume7b146f2000-02-04 15:28:42 +000028"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +000029
# Public names exported by a star-import of this module.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
Skip Montanaro40fc1602001-03-01 04:27:19 +000032
# Scheme classification tables.  The empty string '' stands for "no scheme
# given", i.e. the classification applied by default.  These stay plain
# mutable lists, in their original order.

# Schemes for which urljoin() resolves relative references.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '', 'sftp',
]

# Schemes whose URLs carry a //netloc part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu',
    'rsync', '', 'svn', 'svn+ssh', 'sftp', 'nfs',
]

non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap', 'snews',
    'sip', 'sips',
]

# Schemes for which urlparse() splits ';params' off the path.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms', '', 'sftp',
]

# Schemes for which urlsplit() splits off a '?query'.
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms', 'gopher', 'rtsp',
    'rtspu', 'sip', 'sips', '',
]

# Schemes for which urlsplit() splits off a '#fragment'.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', 'https',
    'shttp', 'snews', 'file', 'prospero', '',
]

# Characters valid in scheme names
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789'
    '+-.'
)
Guido van Rossum23cb2a81994-09-12 10:36:35 +000057
# Entry count at which urlsplit() flushes the whole cache before adding more.
MAX_CACHE_SIZE = 20
# Memoized urlsplit() results, keyed on the call arguments and their types.
_parse_cache = {}


def clear_cache():
    """Empty the urlsplit() result cache in place."""
    _parse_cache.clear()
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000064
65
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Every property below is derived from the ``netloc`` attribute, which the
    class using this mixin has to provide.
    """

    @property
    def username(self):
        """User name from the ``user[:password]@`` prefix of netloc.

        Returns None when netloc contains no '@'.
        """
        netloc = self.netloc
        if "@" in netloc:
            # Split on the LAST '@' so that an '@' inside the userinfo stays
            # part of the userinfo.
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Password from the ``user:password@`` prefix of netloc.

        Everything after the first ':' of the userinfo is the password.
        Returns None when netloc has no '@' or the userinfo has no ':'.
        """
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host part of netloc, without userinfo or port.

        For a bracketed literal such as ``[::1]:80`` the text between the
        brackets is returned.  Returns None when the host part is empty.
        """
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            # Bracketed (IPv6-style) literal: keep what precedes the first
            # ']' and drop the leading '['.
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Port number from netloc as an int, or None when there is none.

        An empty port (``host:``) is treated the same as a missing one and
        yields None.  A non-numeric port raises ValueError from int().
        """
        # Drop the userinfo and any bracketed literal so that the colons
        # inside ``[...]`` are not mistaken for the port separator.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            if not port:
                # "host:" -- the delimiter is present but the port is empty.
                return None
            return int(port, 10)
        return None
Fred Drakead5177c2006-04-01 22:14:43 +0000108
Raymond Hettinger0f6a6562008-01-11 18:04:55 +0000109from collections import namedtuple
Fred Drakead5177c2006-04-01 22:14:43 +0000110
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """5-field named tuple for a split URL, plus the ResultMixin properties."""

    # No per-instance __dict__: the tuple fields are the only state.
    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL string via urlunsplit()."""
        return urlunsplit(self)
117
118
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """6-field named tuple for a parsed URL, plus the ResultMixin properties."""

    # No per-instance __dict__: the tuple fields are the only state.
    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL string via urlunparse()."""
        return urlunparse(self)
125
126
def urlparse(url, scheme='', allow_fragments=True):
    """Break a URL into its 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    The result is a ParseResult, i.e. the 6-tuple
    (scheme, netloc, path, params, query, fragment).  Components are not
    subdivided further (netloc stays one string) and % escapes are left
    untouched."""
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Params are only split off for schemes listed in uses_params.
    if ';' in path and scheme in uses_params:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
Fred Drake5751a222001-11-16 02:52:57 +0000140
def _splitparams(url):
    """Split ``path;params`` into ``(path, params)``.

    When the path contains a '/', only a ';' at or after the last '/' counts,
    and if there is none the path is returned whole with empty params.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semi = url.find(';', last_slash)
        if semi < 0:
            return url, ''
    else:
        semi = url.find(';')
    return url[:semi], url[semi + 1:]
149
def _splitnetloc(url, start=0):
    """Return ``(netloc, rest)`` for *url*, with the netloc starting at *start*.

    The netloc runs up to the earliest '/', '?' or '#' found at or after
    *start*, or to the end of the string when none of them occurs.
    """
    found = [pos
             for pos in (url.find(ch, start) for ch in '/?#')
             if pos >= 0]
    end = min(found) if found else len(url)
    return url[start:end], url[end:]
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000157
def urlsplit(url, scheme='', allow_fragments=True):
    """Break a URL into its 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    The result is a SplitResult, i.e. the 5-tuple
    (scheme, netloc, path, query, fragment).  Components are not subdivided
    further (netloc stays one string) and % escapes are left untouched.

    Raises ValueError("Invalid IPv6 URL") when the netloc contains only one
    of '[' and ']'.  Results are memoized in _parse_cache."""
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key, so equal-comparing values of
    # different string types get separate cache entries.
    cache_key = (url, scheme, allow_fragments, type(url), type(scheme))
    hit = _parse_cache.get(cache_key)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    # A literal lower-case 'http' prefix is the common case: it is accepted
    # without the per-character scan, and its query/fragment are split
    # without consulting uses_query / uses_fragment.
    plain_http = False
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        plain_http = prefix == 'http'
        if plain_http or all(ch in scheme_chars for ch in prefix):
            scheme, url = prefix.lower(), url[colon + 1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Brackets must come as a pair: exactly one of them is an error.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if (allow_fragments and '#' in url
            and (plain_http or scheme in uses_fragment)):
        url, fragment = url.split('#', 1)
    if '?' in url and (plain_http or scheme in uses_query):
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000207
def urlunparse(data):
    """Reassemble a parsed 6-tuple into a URL string.

    The result can differ slightly from the URL originally parsed while
    still being equivalent, e.g. when the original carried a redundant
    delimiter such as a '?' followed by an empty query (the draft states
    that these are equivalent)."""
    scheme, netloc, path, params, query, fragment = data
    # Fold non-empty params back into the path, then defer to urlunsplit().
    path_and_params = "%s;%s" % (path, params) if params else path
    return urlunsplit((scheme, netloc, path_and_params, query, fragment))
217
def urlunsplit(data):
    """Reassemble a 5-tuple (scheme, netloc, path, query, fragment) into a
    URL string.  Empty components are left out together with their
    delimiters."""
    scheme, netloc, result, query, fragment = data
    # A '//' authority part is written when there is a netloc, or when the
    # scheme is one that uses a netloc and the path does not already start
    # with '//'.
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and result[:2] != '//')
    if wants_authority:
        if result and result[:1] != '/':
            result = '/' + result
        result = '//' + (netloc or '') + result
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000230
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    base: the URL that `url` is resolved against; when empty, `url` is
        returned unchanged.
    url: the reference to resolve; when empty, `base` is returned unchanged.
    allow_fragments: passed through to urlparse() for both arguments.

    Returns the joined URL as a string.  `url` is returned as-is when its
    scheme differs from the base's or is not listed in uses_relative."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base's scheme is the default for a reference that names none.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        # A reference with its own netloc replaces everything after the scheme.
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    # A path starting with '/' is used as given, on the base's netloc.
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: start from the base's path.
        path = bpath
        if not params:
            params = bparams
        else:
            # The reference supplied params of its own: drop the last
            # character of the base path and return without inheriting the
            # base's query.
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                                params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: replace the last segment of the base path with the
    # segments of the reference.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' becomes an empty segment so the result ends with '/'.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly delete the first "<name>/.." pair found strictly inside the
    # list (index 1 up to, but not including, the last index) until no such
    # pair is left.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # The scan above never examines the last index, so a trailing '..' is
    # handled here.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000287
def urldefrag(url):
    """Strip the fragment, if any, from a URL.

    Returns a ``(url_without_fragment, fragment)`` tuple.  When the URL
    contains no '#', it is returned unchanged and the fragment is the empty
    string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000301
# Percent-decoding helper for parse_qs and parse_qsl.  It is defined here
# rather than imported from urllib because urllib itself uses this module
# (urljoin), so importing it back would be circular.

_hexdig = '0123456789ABCDEFabcdef'
# Every two-hex-digit string (either letter case) mapped to its character.
_hextochr = dict((hi + lo, chr(int(hi + lo, 16)))
                 for hi in _hexdig for lo in _hexdig)


def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    pieces = s.split('%')
    # The text before the first '%' is never an escape.
    decoded = [pieces[0]]
    for chunk in pieces[1:]:
        code, tail = chunk[:2], chunk[2:]
        try:
            decoded.append(_hextochr[code] + tail)
        except KeyError:
            # Not two hex digits after the '%': keep the text literally.
            decoded.append('%' + chunk)
        except UnicodeDecodeError:
            # NOTE(review): presumably reached on Python 2 when the decoded
            # byte cannot be joined to a non-ASCII unicode tail -- confirm.
            decoded.append(unichr(int(code, 16)) + tail)
    return "".join(decoded)
322
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a dictionary of value lists.

        Arguments:

        qs: URL-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            URL encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        Returns a dict mapping each field name to the list of its values,
        in the order parse_qsl() yields them.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
348
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples in the order the fields appear.
    """
    # Fields may be separated by either '&' or ';'.
    fields = [field
              for group in qs.split('&')
              for field in group.split(';')]
    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        parts = field.split('=', 1)
        if len(parts) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control name without '=' counts as a blank value.
            if not keep_blank_values:
                continue
            parts.append('')
        if parts[1] or keep_blank_values:
            name = unquote(parts[0].replace('+', ' '))
            value = unquote(parts[1].replace('+', ' '))
            result.append((name, value))

    return result
388
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000389
# Default input for test().  test() splits each line on whitespace and skips
# blank lines.  The first word of a line is a URL reference; the first one
# seen becomes the base URL, and every reference is joined against it.  On a
# three-word line of the form "<reference> = <URL:expected>", the third word
# is compared with the joined result.
test_input = """
 http://a/b/c/d

 g:h = <URL:g:h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 g = <URL:http://a/b/c/g>
 ./g = <URL:http://a/b/c/g>
 g/ = <URL:http://a/b/c/g/>
 /g = <URL:http://a/g>
 //g = <URL:http://g>
 ?y = <URL:http://a/b/c/d?y>
 g?y = <URL:http://a/b/c/g?y>
 g?y/./x = <URL:http://a/b/c/g?y/./x>
 . = <URL:http://a/b/c/>
 ./ = <URL:http://a/b/c/>
 .. = <URL:http://a/b/>
 ../ = <URL:http://a/b/>
 ../g = <URL:http://a/b/g>
 ../.. = <URL:http://a/>
 ../../g = <URL:http://a/g>
 ../../../g = <URL:http://a/../g>
 ./../g = <URL:http://a/b/g>
 ./g/. = <URL:http://a/b/c/g/>
 /./g = <URL:http://a/./g>
 g/./h = <URL:http://a/b/c/g/h>
 g/../h = <URL:http://a/b/c/h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 http:?y = <URL:http://a/b/c/d?y>
 http:g?y = <URL:http://a/b/c/g?y>
 http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000423
def test():
    """Run the urlparse()/urljoin() self-test and print the results.

    Input comes from the file named by the first command-line argument
    ('-' means standard input), or from test_input when no argument is
    given.  Blank lines are skipped.  The first word of each line is parsed
    and joined against the base, which is the first URL seen.  A line of the
    form "<url> = <URL:expected>" additionally has the joined result checked,
    and a mismatch is reported on an 'EXPECTED' line.
    """
    import sys
    base = ''
    opened = None  # set only for a file this function opened itself
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        try:
            from cStringIO import StringIO
        except ImportError:
            try:
                from StringIO import StringIO
            except ImportError:
                # Neither Python 2 module is available.
                from io import StringIO
        fp = StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            abs = urljoin(base, url)
            if not base:
                base = abs
            wrapped = '<URL:%s>' % abs
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        # Close only what was opened here; sys.stdin is left alone.
        if opened is not None:
            opened.close()


if __name__ == '__main__':
    test()