"""Parse (absolute and relative) URLs.

The urlparse module is based upon the following RFC specifications.

RFC 3986 (STD 66): "Uniform Resource Identifiers" by T. Berners-Lee,
R. Fielding and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter and J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine,
June 1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter
and M. McCahill, December 1994.

RFC 3986 is considered the current standard, and any changes to the urlparse
module should conform to it.  The module is not yet entirely compliant with
RFC 3986: de facto parsing scenarios are sometimes honoured, and parsing
behaviour from the older RFCs is retained for backward compatibility.  The
test cases in test_urlparse.py provide a good indicator of parsing behaviour.

"""

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']
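
# For illustration (doctest-style; the output shown reflects the lists above):
# these lists control which parsing rules apply to a given scheme.  'mailto',
# for example, is absent from uses_relative and uses_query, so urljoin() will
# not resolve a relative reference against a mailto: base and urlsplit() will
# leave a '?' inside the path of a mailto: URL instead of splitting a query.
#
#   >>> urljoin('mailto:user@example.com', 'other')
#   'other'
#   >>> urlsplit('mailto:user@example.com?subject=hi').path
#   'user@example.com?subject=hi'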

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache."""
    _parse_cache.clear()


class ResultMixin(object):
    """Shared methods for the parsed result objects."""

    @property
    def username(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        netloc = self.netloc
        if "@" in netloc:
            netloc = netloc.rsplit("@", 1)[1]
        if ":" in netloc:
            netloc = netloc.split(":", 1)[0]
        return netloc.lower() or None

    @property
    def port(self):
        netloc = self.netloc
        if "@" in netloc:
            netloc = netloc.rsplit("@", 1)[1]
        if ":" in netloc:
            port = netloc.split(":", 1)[1]
            return int(port, 10)
        return None

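# Illustrative behaviour of the properties above (doctest-style; relies on
# urlsplit(), defined below, and the output shown reflects the code above):
#
#   >>> p = urlsplit('http://user:secret@WWW.Example.COM:8080/path')
#   >>> p.username, p.password, p.hostname, p.port
#   ('user', 'secret', 'www.example.com', 8080)
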
from collections import namedtuple

class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunsplit(self)


class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunparse(self)


def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    tuple = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = tuple
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)

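# Illustrative usage of urlparse() (doctest-style; the output shown reflects
# the implementation above, using the namedtuple repr of ParseResult):
#
#   >>> urlparse('http://a.com/p;x?q=1#f')
#   ParseResult(scheme='http', netloc='a.com', path='/p', params='x', query='q=1', fragment='f')
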
def _splitparams(url):
    if '/' in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

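# Illustrative behaviour of _splitparams() (doctest-style; the output shown
# reflects the helper above): only a ';' in the last path segment starts the
# params component.
#
#   >>> _splitparams('/a/b;type=d')
#   ('/a/b', 'type=d')
#   >>> _splitparams('/a;x/b')
#   ('/a;x/b', '')
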
def _splitnetloc(url, start=0):
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

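# Illustrative behaviour of _splitnetloc() (doctest-style; the output shown
# reflects the helper above):
#
#   >>> _splitnetloc('//www.example.com/path?q=1', 2)
#   ('www.example.com', '/path?q=1')
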
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v

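# Illustrative usage of urlsplit() (doctest-style; the output shown reflects
# the implementation above).  Results are cached in _parse_cache, so repeated
# calls with the same arguments return the same SplitResult object.
#
#   >>> urlsplit('http://www.example.com/a/b?x=1#top')
#   SplitResult(scheme='http', netloc='www.example.com', path='/a/b', query='x=1', fragment='top')
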
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment = data
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))

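# Illustrative round-trip through urlunparse() (doctest-style; the output
# shown reflects the implementation above):
#
#   >>> urlunparse(('http', 'www.example.com', '/path', 'x', 'q=1', 'frag'))
#   'http://www.example.com/path;x?q=1#frag'
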
def urlunsplit(data):
    """Combine the elements of a 5-tuple as returned by urlsplit() back
    into a (possibly slightly different, but equivalent) URL string."""
    scheme, netloc, url, query, fragment = data
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url

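# Illustrative usage of urlunsplit() (doctest-style; the output shown
# reflects the implementation above):
#
#   >>> urlunsplit(('http', 'www.example.com', '/path', 'q=1', 'frag'))
#   'http://www.example.com/path?q=1#frag'
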
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        path = bpath
        if not params:
            params = bparams
        else:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                                params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))

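# Illustrative usage of urljoin() (doctest-style; the output shown reflects
# the implementation above, and many more cases appear in test_input below):
#
#   >>> urljoin('http://a/b/c/d', 'g')
#   'http://a/b/c/g'
#   >>> urljoin('http://a/b/c/d', '../../g')
#   'http://a/g'
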
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''

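# Illustrative usage of urldefrag() (doctest-style; the output shown reflects
# the implementation above):
#
#   >>> urldefrag('http://www.example.com/index.html#section2')
#   ('http://www.example.com/index.html', 'section2')
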
# unquote method for parse_qs and parse_qsl
# Cannot be imported directly from urllib, as that would create a circular
# reference: urllib itself uses urlparse methods (urljoin).


_hexdig = '0123456789ABCDEFabcdef'
_hextochr = dict((a+b, chr(int(a+b, 16))) for a in _hexdig for b in _hexdig)

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    res = s.split('%')
    for i in xrange(1, len(res)):
        item = res[i]
        try:
            res[i] = _hextochr[item[:2]] + item[2:]
        except KeyError:
            res[i] = '%' + item
        except UnicodeDecodeError:
            res[i] = unichr(int(item[:2], 16)) + item[2:]
    return "".join(res)

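# Illustrative behaviour of unquote() (doctest-style; the output shown
# reflects the implementation above).  A '%' that is not followed by two hex
# digits is left alone:
#
#   >>> unquote('abc%20def%2Fg')
#   'abc def/g'
#   >>> unquote('100%')
#   '100%'
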
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.
    """
    dict = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        if name in dict:
            dict[name].append(value)
        else:
            dict[name] = [value]
    return dict

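# Illustrative usage of parse_qs() (doctest-style; the output shown reflects
# the implementation above).  Repeated fields accumulate into a list, and
# dictionary key order is arbitrary, hence the sorted() call:
#
#   >>> result = parse_qs('a=1&a=2&b=3')
#   >>> sorted(result.items())
#   [('a', ['1', '2']), ('b', ['3'])]
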
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list, as G-d intended.
    """
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError, "bad query field: %r" % (name_value,)
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r

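# Illustrative usage of parse_qsl() (doctest-style; the output shown reflects
# the implementation above).  Both '&' and ';' separate pairs, '+' decodes to
# a space, and %XX escapes are unquoted:
#
#   >>> parse_qsl('a=1;b=2+3&c=%7E')
#   [('a', '1'), ('b', '2 3'), ('c', '~')]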

test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""

def test():
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        fp = StringIO(test_input)
    for line in fp:
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print '%-10s : %s' % (url, parts)
        abs = urljoin(base, url)
        if not base:
            base = abs
        wrapped = '<URL:%s>' % abs
        print '%-10s = %s' % (url, wrapped)
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print 'EXPECTED', words[2], '!!!!!!!!!!'

if __name__ == '__main__':
    test()