blob: 0e73e4b97bd79787dd0ab2a08c413beee3fd03af [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaranddc3ddd2010-04-17 14:33:55 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
Senthil Kumaranddc3ddd2010-04-17 14:33:55 +00008RFC2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
9Berners-Lee, R. Fielding, and L. Masinter, August 1998.
10
RFC2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski, July 1998.
12
13RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
141995.
15
16RFC1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
17McCahill, December 1994
18
RFC 3986 is considered the current standard and any changes to urlparse module
should conform to this. urlparse module is not entirely compliant with this.
The de facto scenarios of parsing are considered sometimes and for backward
compatibility purposes, older RFC uses of parsing are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
24
Guido van Rossume7b146f2000-02-04 15:28:42 +000025"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +000026
# Public names exported by a star-import of this module.
__all__ = [
    "urlparse", "urlunparse", "urljoin", "urldefrag",
    "urlsplit", "urlunsplit",
    "parse_qs", "parse_qsl",
]
Skip Montanaro40fc1602001-03-01 04:27:19 +000029
# Scheme classification tables.  The empty string '' stands for "no scheme"
# and makes the corresponding behaviour the default.

# Schemes for which urljoin() resolves relative references.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', '', 'sftp',
]

# Schemes whose URLs carry a network location ("//netloc") part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https',
    'shttp', 'snews', 'prospero', 'rtsp', 'rtspu',
    'rsync', '', 'svn', 'svn+ssh', 'sftp',
]

non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet',
    'wais', 'imap', 'snews', 'sip', 'sips',
]

# Schemes for which urlparse() splits ";params" off the path.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip',
    'sips', 'mms', '', 'sftp',
]

# Schemes for which urlsplit() splits off a "?query" component.
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '',
]

# Schemes for which urlsplit() splits off a "#fragment" component.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero', '',
]
Guido van Rossum23cb2a81994-09-12 10:36:35 +000048
# Every character that may appear in a scheme name: ASCII letters,
# digits, and the punctuation '+', '-' and '.'.
scheme_chars = ''.join([
    'abcdefghijklmnopqrstuvwxyz',
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
    '0123456789',
    '+-.',
])
Guido van Rossum23cb2a81994-09-12 10:36:35 +000054
Guido van Rossum74495401997-07-14 19:08:15 +000055MAX_CACHE_SIZE = 20
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000056_parse_cache = {}
57
def clear_cache():
    """Discard every entry in the module-level parse cache.

    The cache dictionary is emptied in place, so other references to it
    stay valid.
    """
    _parse_cache.clear()
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000061
62
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    The host class must provide a ``netloc`` string attribute of the form
    ``[userinfo@]host[:port]``.  All properties return None when the
    corresponding part is absent.

    Bracketed IPv6 literals (``[::1]:80``) are understood by ``hostname``
    and ``port``, and an empty port (``host:``) yields None instead of
    raising ValueError.
    """

    def _split_userinfo(self):
        """Return (userinfo, hostport); userinfo is None without an '@'."""
        netloc = self.netloc
        if "@" in netloc:
            # Split on the LAST '@' so that a password may contain '@'.
            userinfo, hostport = netloc.rsplit("@", 1)
            return userinfo, hostport
        return None, netloc

    def _split_hostport(self):
        """Return (host, rest) where rest is whatever follows the host.

        For a bracketed IPv6 literal the brackets are stripped from host
        and rest starts right after ']'; otherwise host ends at the first
        ':' and rest starts at that ':' (or is empty).
        """
        hostport = self._split_userinfo()[1]
        if hostport[:1] == "[" and "]" in hostport:
            close = hostport.index("]")
            return hostport[1:close], hostport[close + 1:]
        if ":" in hostport:
            colon = hostport.index(":")
            return hostport[:colon], hostport[colon:]
        return hostport, ""

    @property
    def username(self):
        """User name from the userinfo part, or None if there is no '@'."""
        userinfo = self._split_userinfo()[0]
        if userinfo is None:
            return None
        return userinfo.split(":", 1)[0]

    @property
    def password(self):
        """Password from the userinfo part, or None if none was given."""
        userinfo = self._split_userinfo()[0]
        if userinfo is not None and ":" in userinfo:
            return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host name (IPv6 brackets removed), or None if empty."""
        host = self._split_hostport()[0]
        return host.lower() or None

    @property
    def port(self):
        """Port as an int, or None when absent or empty.

        Raises ValueError if the port text is not a base-10 integer.
        """
        rest = self._split_hostport()[1]
        if ":" in rest:
            port = rest.split(":", 1)[1]
            if port:
                return int(port, 10)
        return None
103
Raymond Hettinger0f6a6562008-01-11 18:04:55 +0000104from collections import namedtuple
Fred Drakead5177c2006-04-01 22:14:43 +0000105
class SplitResult(namedtuple('SplitResult',
                             ['scheme', 'netloc', 'path',
                              'query', 'fragment']),
                  ResultMixin):
    """5-tuple (scheme, netloc, path, query, fragment) built by urlsplit()."""

    # No per-instance __dict__: instances stay as light as plain tuples.
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunsplit()."""
        return urlunsplit(self)
112
113
class ParseResult(namedtuple('ParseResult',
                             ['scheme', 'netloc', 'path',
                              'params', 'query', 'fragment']),
                  ResultMixin):
    """6-tuple (scheme, netloc, path, params, query, fragment) built by urlparse()."""

    # No per-instance __dict__: instances stay as light as plain tuples.
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunparse()."""
        return urlunparse(self)
120
121
def urlparse(url, scheme='', allow_fragments=True):
    """Split a URL into its six components.

    The general shape is
        <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    and the result is a ParseResult
        (scheme, netloc, path, params, query, fragment).

    Components are not broken down any further (netloc stays one string)
    and % escapes are left untouched.
    """
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Only some schemes carry ";params" on the path.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
Fred Drake5751a222001-11-16 02:52:57 +0000135
136def _splitparams(url):
137 if '/' in url:
138 i = url.find(';', url.rfind('/'))
139 if i < 0:
140 return url, ''
141 else:
142 i = url.find(';')
143 return url[:i], url[i+1:]
144
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000145def _splitnetloc(url, start=0):
Guido van Rossumc6a04c22008-01-05 22:19:06 +0000146 delim = len(url) # position of end of domain part of url, default is end
147 for c in '/?#': # look for delimiters; the order is NOT important
148 wdelim = url.find(c, start) # find first of this delim
149 if wdelim >= 0: # if found
150 delim = min(delim, wdelim) # use earliest delim position
151 return url[start:delim], url[delim:] # return (domain, rest)
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000152
def urlsplit(url, scheme='', allow_fragments=True):
    """Split a URL into its five components.

    The general shape is
        <scheme>://<netloc>/<path>?<query>#<fragment>
    and the result is a SplitResult
        (scheme, netloc, path, query, fragment).

    *scheme* is the default used when the URL does not name one; when
    *allow_fragments* is false, '#' is left in place instead of starting
    a fragment.  Components are not broken down any further and % escapes
    are left untouched.  Results are memoized in the module-level cache.
    """
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key so str and unicode inputs
    # never share an entry.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    # The literal prefix 'http' is the common case: it skips the scheme
    # character check and always has its query and fragment split off.
    is_http = False
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':
            is_http = True
            scheme, url = prefix.lower(), url[colon+1:]
        elif all(ch in scheme_chars for ch in prefix):
            scheme, url = prefix.lower(), url[colon+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and (is_http or scheme in uses_fragment) and '#' in url:
        url, fragment = url.split('#', 1)
    if (is_http or scheme in uses_query) and '?' in url:
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000196
def urlunparse(data):
    """Rebuild a URL string from a 6-item (scheme, netloc, path, params,
    query, fragment) sequence.

    The result may differ slightly from the URL that was originally
    parsed, while remaining equivalent, if that URL contained redundant
    delimiters such as a '?' followed by an empty query.
    """
    scheme, netloc, path, params, query, fragment = data
    # Fold the parameters back into the path, then let urlunsplit finish.
    path_with_params = "%s;%s" % (path, params) if params else path
    return urlunsplit((scheme, netloc, path_with_params, query, fragment))
206
def urlunsplit(data):
    """Rebuild a URL string from a 5-item (scheme, netloc, path, query,
    fragment) sequence.

    Empty scheme, query and fragment components are omitted together
    with their delimiters.
    """
    scheme, netloc, path, query, fragment = data
    # Emit "//netloc" when there is a netloc, or when the scheme takes one
    # and the path does not already begin with "//".
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and path[:2] != '//')
    if wants_authority:
        if path and path[:1] != '/':
            path = '/' + path
        path = '//' + (netloc or '') + path
    if scheme:
        path = scheme + ':' + path
    if query:
        path += '?' + query
    if fragment:
        path += '#' + fragment
    return path
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000219
def urljoin(base, url, allow_fragments=True):
    """Resolve *url*, which may be relative, against the absolute *base*.

    Returns *url* unchanged when *base* is empty, when *url* names a
    different scheme, or when the scheme does not support relative
    references; returns *base* when *url* is empty.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference has its own network location: nothing to merge.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: reuse the base path, and the base params/query
        # unless the reference supplies its own.
        path = bpath
        if params:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))

    # Relative path: replace the last segment of the base path with the
    # reference's segments, then normalize '.' and '..'.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    while True:
        # Drop the first "<name>/.." pair that is not at the very end,
        # restarting the scan after each removal.
        for i in range(1, len(segments) - 1):
            if segments[i] == '..' and segments[i-1] not in ('', '..'):
                del segments[i-1:i+1]
                break
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000276
def urldefrag(url):
    """Strip the fragment, if any, from *url*.

    Returns a (defragmented_url, fragment) tuple.  When the URL contains
    no '#', it is returned unchanged with '' as the fragment.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    without_fragment = urlunparse((scheme, netloc, path, params, query, ''))
    return without_fragment, fragment
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000290
# unquote() implementation for parse_qs and parse_qsl.
# It cannot be imported from urllib, as that would create a circular
# import: urllib itself uses urlparse functions (urljoin).
294
Senthil Kumaran34f92772010-03-29 19:30:44 +0000295
296_hexdig = '0123456789ABCDEFabcdef'
297_hextochr = dict((a+b, chr(int(a+b,16))) for a in _hexdig for b in _hexdig)
Facundo Batistac585df92008-09-03 22:35:50 +0000298
def unquote(s):
    """Replace %xx escapes by the single character they encode.

    unquote('abc%20def') -> 'abc def'.  A '%' that is not followed by two
    hex digits is kept as is.
    """
    chunks = s.split('%')
    # Text before the first '%' carries no escape.
    decoded = [chunks[0]]
    for chunk in chunks[1:]:
        code, tail = chunk[:2], chunk[2:]
        try:
            decoded.append(_hextochr[code] + tail)
        except KeyError:
            # Not a valid escape: put the '%' back untouched.
            decoded.append('%' + chunk)
        except UnicodeDecodeError:
            # The concatenation above failed to decode; build the
            # character with unichr() instead.
            decoded.append(unichr(int(code, 16)) + tail)
    return "".join(decoded)
311
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a dictionary of value lists.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be kept as blank strings.  When
        true they are retained; when false (the default) fields with
        blank values are dropped as if they were not present.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dict mapping each field name to the list of its values,
    in order of appearance.
    """
    values_by_name = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        values_by_name.setdefault(name, []).append(value)
    return values_by_name
337
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be kept as blank strings.  When
        true they are retained; when false (the default) fields with
        blank values are dropped as if they were not present.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Fields are separated by '&' or ';'.  In names and values '+' becomes
    a space and %xx escapes are decoded.  Returns a list.
    """
    fields = []
    for amp_part in qs.split('&'):
        fields.extend(amp_part.split(';'))

    result = []
    for field in fields:
        if not (field or strict_parsing):
            continue
        parts = field.split('=', 1)
        if len(parts) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # Handle case of a control-name with no equal sign
            if not keep_blank_values:
                continue
            parts.append('')
        raw_name, raw_value = parts
        if raw_value or keep_blank_values:
            name = unquote(raw_name.replace('+', ' '))
            value = unquote(raw_value.replace('+', ' '))
            result.append((name, value))

    return result
377
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000378
# Built-in data for test().  Blank lines are skipped.  Every other line
# starts with a URL, optionally followed by "= <URL:expected>".  The first
# URL becomes the base that the following ones are joined against, and the
# expected value is compared with the joined result.
test_input = """
      http://a/b/c/d

      g:h = <URL:g:h>
      http:g = <URL:http://a/b/c/g>
      http: = <URL:http://a/b/c/d>
      g = <URL:http://a/b/c/g>
      ./g = <URL:http://a/b/c/g>
      g/ = <URL:http://a/b/c/g/>
      /g = <URL:http://a/g>
      //g = <URL:http://g>
      ?y = <URL:http://a/b/c/d?y>
      g?y = <URL:http://a/b/c/g?y>
      g?y/./x = <URL:http://a/b/c/g?y/./x>
      . = <URL:http://a/b/c/>
      ./ = <URL:http://a/b/c/>
      .. = <URL:http://a/b/>
      ../ = <URL:http://a/b/>
      ../g = <URL:http://a/b/g>
      ../.. = <URL:http://a/>
      ../../g = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g = <URL:http://a/b/g>
      ./g/. = <URL:http://a/b/c/g/>
      /./g = <URL:http://a/./g>
      g/./h = <URL:http://a/b/c/g/h>
      g/../h = <URL:http://a/b/c/h>
      http:g = <URL:http://a/b/c/g>
      http: = <URL:http://a/b/c/d>
      http:?y = <URL:http://a/b/c/d?y>
      http:g?y = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000412
def _run_test_lines(lines):
    """Parse and join every URL found in *lines*, printing the results.

    Each non-blank line starts with a URL.  The first URL seen becomes the
    base for all later joins.  A line of the form "url = <URL:expected>"
    additionally has its joined result checked, and a mismatch is reported
    with an EXPECTED line.
    """
    base = ''
    for line in lines:
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print('%-10s : %s' % (url, parts))
        joined = urljoin(base, url)
        if not base:
            base = joined
        wrapped = '<URL:%s>' % joined
        print('%-10s = %s' % (url, wrapped))
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print('EXPECTED %s !!!!!!!!!!' % (words[2],))


def test():
    """Run the self-test.

    Test lines are read from the file named by the first command-line
    argument ('-' means standard input), or from the built-in test_input
    when no argument is given.
    """
    import sys
    args = sys.argv[1:]
    if not args:
        # split() in the helper ignores line endings, so plain
        # splitlines() is enough; no StringIO wrapper is needed.
        _run_test_lines(test_input.splitlines())
    elif args[0] == '-':
        _run_test_lines(sys.stdin)
    else:
        fp = open(args[0])
        try:
            _run_test_lines(fp)
        finally:
            # Close the file we opened, even if a line makes parsing fail.
            fp.close()
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000443
# Run the self-test when this module is executed as a script.
if __name__ == '__main__':
    test()