blob: 7f3a9a4b5c8ae634a1c172409c6790a64f813447 [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaran420ec8a2010-04-17 14:30:53 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter and
J. Zawinski, July 1998.
Senthil Kumaran420ec8a2010-04-17 14:30:53 +000015
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
Andrew M. Kuchlingba88b7f2010-04-30 00:49:09 +000019RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaran420ec8a2010-04-17 14:30:53 +000020McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
the urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
28
Guido van Rossume7b146f2000-02-04 15:28:42 +000029"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +000030
# Names exported by "from urlparse import *".
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]

# A classification of schemes ('' means apply by default)

# Schemes for which urljoin() resolves a relative reference against the
# base; for any other scheme urljoin() returns the reference unchanged.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
# Schemes for which urlunsplit() emits a '//' authority part even when the
# netloc is empty, and for which urljoin() inherits the base netloc.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp','nfs']
# NOTE(review): not referenced by any function visible in this module.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
# Schemes for which urlparse() splits ';params' off the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
# Schemes for which urlsplit() splits '?query' off the path.
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
# Schemes for which urlsplit() splits '#fragment' off the path.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() empties the whole cache once it holds this many entries.
MAX_CACHE_SIZE = 20
# Maps (url, scheme, allow_fragments, type(url), type(scheme)) to the
# SplitResult computed by urlsplit().
_parse_cache = {}
61
def clear_cache():
    """Clear the parse cache.

    Discards every result memoized by urlsplit(); urlsplit() also calls
    this itself once the cache reaches MAX_CACHE_SIZE entries.
    """
    _parse_cache.clear()
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000065
66
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Every property below is derived from the ``netloc`` attribute supplied
    by the concrete result class.  The netloc is read as
    ``[username[:password]@]host[:port]``, where an IPv6 host is written
    inside square brackets.
    """

    @property
    def username(self):
        """User name from the netloc, or None if the netloc has no '@'."""
        netloc = self.netloc
        if "@" in netloc:
            # rsplit: only the last '@' separates userinfo from the host.
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Password from the netloc, or None if none is given."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host name, or None if the netloc is empty.

        For a bracketed (IPv6) host the brackets are stripped.
        """
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Port number as an int, or None if no port is given.

        An empty port ("host:") is allowed by RFC 3986 and is treated like
        a missing one.  Raises ValueError if the port is not a decimal
        number.
        """
        # Drop userinfo and any bracketed IPv6 literal so that the only
        # ':' left is the port separator.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            if port:
                return int(port, 10)
        return None
Fred Drakead5177c2006-04-01 22:14:43 +0000109
Raymond Hettinger0f6a6562008-01-11 18:04:55 +0000110from collections import namedtuple
Fred Drakead5177c2006-04-01 22:14:43 +0000111
class SplitResult(namedtuple('SplitResult',
                             ['scheme', 'netloc', 'path', 'query',
                              'fragment']),
                  ResultMixin):
    """Five-field named tuple returned by urlsplit()."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string with urlunsplit()."""
        return urlunsplit(self)
118
119
class ParseResult(namedtuple('ParseResult',
                             ['scheme', 'netloc', 'path', 'params', 'query',
                              'fragment']),
                  ResultMixin):
    """Six-field named tuple returned by urlparse()."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string with urlunparse()."""
        return urlunparse(self)
126
127
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Returns a ParseResult, a 6-tuple of
    (scheme, netloc, path, params, query, fragment).  The components are
    not broken up further (netloc stays a single string) and % escapes
    are not expanded.  Params are only split off the path for schemes
    listed in uses_params.
    """
    scheme, netloc, url, query, fragment = urlsplit(url, scheme,
                                                    allow_fragments)
    params = ''
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    return ParseResult(scheme, netloc, url, params, query, fragment)
Fred Drake5751a222001-11-16 02:52:57 +0000141
142def _splitparams(url):
143 if '/' in url:
144 i = url.find(';', url.rfind('/'))
145 if i < 0:
146 return url, ''
147 else:
148 i = url.find(';')
149 return url[:i], url[i+1:]
150
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000151def _splitnetloc(url, start=0):
Guido van Rossumc6a04c22008-01-05 22:19:06 +0000152 delim = len(url) # position of end of domain part of url, default is end
153 for c in '/?#': # look for delimiters; the order is NOT important
154 wdelim = url.find(c, start) # find first of this delim
155 if wdelim >= 0: # if found
156 delim = min(delim, wdelim) # use earliest delim position
157 return url[start:delim], url[delim:] # return (domain, rest)
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000158
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Returns a SplitResult, a 5-tuple of
    (scheme, netloc, path, query, fragment).  The components are not
    broken up further (netloc stays a single string) and % escapes are
    not expanded.  Results are memoized in the module-level parse cache.
    Raises ValueError when the netloc contains only one of '[' and ']'.
    """
    allow_fragments = bool(allow_fragments)
    cache_key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(cache_key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix, rest = url[:colon], url[colon+1:]
        if prefix == 'http':  # optimize the common case
            scheme, url = prefix.lower(), rest
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                # Exactly one of the two brackets present: unbalanced.
                if ('[' in netloc) != (']' in netloc):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            result = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[cache_key] = result
            return result
        # Only accept the prefix as a scheme if every character is legal.
        if all(c in scheme_chars for c in prefix):
            scheme, url = prefix.lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000208
def urlunparse(data):
    """Combine a 6-item sequence back into a URL string.

    data is (scheme, netloc, path, params, query, fragment).  The result
    may be a slightly different but equivalent URL if the URL that was
    parsed originally had redundant delimiters, e.g. a ? with an empty
    query.
    """
    scheme, netloc, path, params, query, fragment = data
    if params:
        path = "%s;%s" % (path, params)
    return urlunsplit((scheme, netloc, path, query, fragment))
218
def urlunsplit(data):
    """Combine a 5-item sequence back into a URL string.

    data is (scheme, netloc, path, query, fragment).  Empty components
    are left out together with their delimiters.
    """
    scheme, netloc, path, query, fragment = data
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and path[:2] != '//')
    if wants_authority:
        # A path that follows an authority must start with '/'.
        if path and path[:1] != '/':
            path = '/' + path
        path = '//' + (netloc or '') + path
    if scheme:
        path = scheme + ':' + path
    if query:
        path = path + '?' + query
    if fragment:
        path = path + '#' + fragment
    return path
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000231
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned.  If url has a
    scheme different from the base's, or a scheme not in uses_relative,
    url is returned unchanged.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own authority: nothing to inherit.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: replaces the base path entirely.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path and not params:
        # Reference carries at most a query and/or fragment: keep the
        # base path and params, and the base query unless overridden.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Merge: base path minus its last segment, followed by the reference
    # path.  A params-only reference such as ';x' has an empty path and
    # so replaces the last base segment with an empty one, instead of
    # chopping a single character off the base path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse the first interior "<name>/.." pair until none
    # is left; the first and last segments are never examined here.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # Trailing '..': drop it and the preceding segment, keeping a
        # trailing slash.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000288
def urldefrag(url):
    """Remove any existing fragment from the URL.

    Returns a tuple (defragmented URL, fragment).  When the URL contains
    no '#', it is returned unchanged and the fragment is the empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000302
# unquote method for parse_qs and parse_qsl
# Cannot be imported directly from urllib, as that would create a circular
# import: urllib itself uses urlparse functions (urljoin).
306
Senthil Kumaranf3e9b2a2010-03-18 12:14:15 +0000307
# Lookup table from every two-character hex string (either case) to the
# character with that code.
_hexdig = '0123456789ABCDEFabcdef'
_hextochr = dict((a+b, chr(int(a+b,16))) for a in _hexdig for b in _hexdig)

def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    A '%' that is not followed by two hex digits is left untouched.
    """
    pieces = s.split('%')
    # Text before the first '%' is literal; every later piece starts with
    # what should be a two-digit hex escape.
    decoded = [pieces[0]]
    for chunk in pieces[1:]:
        code, tail = chunk[:2], chunk[2:]
        try:
            decoded.append(_hextochr[code] + tail)
        except KeyError:
            # Not a valid escape: put the '%' back.
            decoded.append('%' + chunk)
        except UnicodeDecodeError:
            decoded.append(unichr(int(code, 16)) + tail)
    return "".join(decoded)
323
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a dictionary of value lists.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dict mapping each field name to the list of its values, in
    the order parse_qsl() produced them.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
349
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples in the order the fields appear
    in qs, with '+' turned into a space and % escapes decoded.
    """
    # Fields may be separated by either '&' or ';'.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if nv[1] or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
388 return r
389
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000390
# Default input for test(): the first non-blank line is the base URL; each
# later line is "<reference> = <expected result of urljoin(base, reference)>".
test_input = """
 http://a/b/c/d

 g:h = <URL:g:h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 g = <URL:http://a/b/c/g>
 ./g = <URL:http://a/b/c/g>
 g/ = <URL:http://a/b/c/g/>
 /g = <URL:http://a/g>
 //g = <URL:http://g>
 ?y = <URL:http://a/b/c/d?y>
 g?y = <URL:http://a/b/c/g?y>
 g?y/./x = <URL:http://a/b/c/g?y/./x>
 . = <URL:http://a/b/c/>
 ./ = <URL:http://a/b/c/>
 .. = <URL:http://a/b/>
 ../ = <URL:http://a/b/>
 ../g = <URL:http://a/b/g>
 ../.. = <URL:http://a/>
 ../../g = <URL:http://a/g>
 ../../../g = <URL:http://a/../g>
 ./../g = <URL:http://a/b/g>
 ./g/. = <URL:http://a/b/c/g/>
 /./g = <URL:http://a/./g>
 g/./h = <URL:http://a/b/c/g/h>
 g/../h = <URL:http://a/b/c/h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 http:?y = <URL:http://a/b/c/d?y>
 http:g?y = <URL:http://a/b/c/g?y>
 http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000424
def test():
    """Run a simple urlparse()/urljoin() self-check and print the results.

    Input is read from the file named by the first command-line argument
    ('-' means standard input), or from test_input when no argument is
    given.  The first URL read becomes the base for all later joins.  A
    line of the form "<url> = <expected>" is flagged with EXPECTED when
    the joined result differs from the expectation.
    """
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        fp = StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        # Release the input, but never close the process's stdin.
        if fp is not sys.stdin:
            fp.close()

if __name__ == '__main__':
    test()