blob: 8c37dff50ca21919f2d33eeb18689d254be36f06 [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaran420ec8a2010-04-17 14:30:53 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
11RFC2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
14RFC2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
15
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
19RFC1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20McCahill, December 1994
21
RFC 3986 is considered the current standard and any changes to the urlparse
module should conform to it. The urlparse module is not entirely compliant
with it: de facto parsing behaviors are sometimes honored and, for backward
compatibility, parsing rules from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
27
Guido van Rossume7b146f2000-02-04 15:28:42 +000028"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +000029
Fred Drakef606e8d2002-10-16 21:21:39 +000030__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
Facundo Batistac585df92008-09-03 22:35:50 +000031 "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
Skip Montanaro40fc1602001-03-01 04:27:19 +000032
# Scheme classification tables.  The empty string '' stands for "no scheme
# given", i.e. the behaviour applied by default.  These are plain lists, so
# they can be extended in place.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '', 'sftp',
]
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu',
    'rsync', '', 'svn', 'svn+ssh', 'sftp', 'nfs',
]
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap',
    'snews', 'sip', 'sips',
]
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms', '', 'sftp',
]
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms', 'gopher', 'rtsp',
    'rtspu', 'sip', 'sips', '',
]
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', 'https',
    'shttp', 'snews', 'file', 'prospero', '',
]
Guido van Rossum23cb2a81994-09-12 10:36:35 +000051
52# Characters valid in scheme names
Guido van Rossumfad81f02000-12-19 16:48:13 +000053scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
54 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
55 '0123456789'
56 '+-.')
Guido van Rossum23cb2a81994-09-12 10:36:35 +000057
Guido van Rossum74495401997-07-14 19:08:15 +000058MAX_CACHE_SIZE = 20
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000059_parse_cache = {}
60
def clear_cache():
    """Empty the module-level cache of urlsplit() results."""
    _parse_cache.clear()
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000064
65
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Every property derives its value from ``self.netloc``, which the class
    mixing this in must provide.
    """

    @property
    def username(self):
        """User name from the userinfo part of netloc, or None.

        The userinfo is everything before the last '@'; the user name is the
        portion of it before the first ':'.
        """
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Password from the userinfo part of netloc, or None.

        None is returned when netloc has no '@' or the userinfo has no ':'.
        """
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host name, or None when netloc is empty.

        A bracketed IPv6 literal is returned without its brackets.
        Raises ValueError if only one of '[' and ']' is present.
        """
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif '[' in netloc or ']' in netloc:
            raise ValueError("Invalid IPv6 hostname")
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Port number as an int, or None when no port is given.

        Raises ValueError (from int()) if the port text is not a decimal
        number.
        """
        # Drop the userinfo and any bracketed IPv6 literal so that the only
        # ':' left is the one introducing the port.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            # "host:" has a delimiter but no digits: treat it as "no port"
            # instead of letting int('') raise ValueError.
            if port:
                return int(port, 10)
        return None
Fred Drakead5177c2006-04-01 22:14:43 +0000110
Raymond Hettinger0f6a6562008-01-11 18:04:55 +0000111from collections import namedtuple
Fred Drakead5177c2006-04-01 22:14:43 +0000112
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """5-tuple result type used by urlsplit(), with the ResultMixin helpers."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunsplit()."""
        return urlunsplit(self)
119
120
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """6-tuple result type used by urlparse(), with the ResultMixin helpers."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunparse()."""
        return urlunparse(self)
127
128
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    The components are not broken up any further (e.g. netloc stays a
    single string) and % escapes are not expanded."""
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    # Parameters are only split off for schemes listed in uses_params.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
Fred Drake5751a222001-11-16 02:52:57 +0000142
143def _splitparams(url):
144 if '/' in url:
145 i = url.find(';', url.rfind('/'))
146 if i < 0:
147 return url, ''
148 else:
149 i = url.find(';')
150 return url[:i], url[i+1:]
151
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000152def _splitnetloc(url, start=0):
Guido van Rossumc6a04c22008-01-05 22:19:06 +0000153 delim = len(url) # position of end of domain part of url, default is end
Senthil Kumaran8c6d9d72010-04-16 02:46:46 +0000154 if '[' in url: # check for invalid IPv6 URL
155 if not ']' in url: raise ValueError("Invalid IPv6 URL")
156 elif ']' in url:
157 if not '[' in url: raise ValueError("Invalid IPv6 URL")
Guido van Rossumc6a04c22008-01-05 22:19:06 +0000158 for c in '/?#': # look for delimiters; the order is NOT important
159 wdelim = url.find(c, start) # find first of this delim
160 if wdelim >= 0: # if found
161 delim = min(delim, wdelim) # use earliest delim position
162 return url[start:delim], url[delim:] # return (domain, rest)
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000163
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    The components are not broken up any further (e.g. netloc stays a
    single string) and % escapes are not expanded.

    Results are memoized in _parse_cache, keyed on the arguments and
    their types."""
    allow_fragments = bool(allow_fragments)
    cache_key = (url, scheme, allow_fragments, type(url), type(scheme))
    hit = _parse_cache.get(cache_key)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # Avoid runaway growth: throw the whole cache away.
        clear_cache()

    netloc = query = fragment = ''
    # A literal 'http' prefix is the common case: it is accepted without
    # validating its characters and without consulting the scheme tables.
    is_http = False
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        is_http = prefix == 'http'
        if is_http or all(ch in scheme_chars for ch in prefix):
            scheme, url = prefix.lower(), url[colon + 1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and '#' in url and (is_http or
                                           scheme in uses_fragment):
        url, fragment = url.split('#', 1)
    if '?' in url and (is_http or scheme in uses_query):
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000207
def urlunparse(data):
    """Combine a 6-tuple (scheme, netloc, path, params, query, fragment)
    back into a URL string.

    The result may differ slightly from the URL that was originally parsed
    while still being equivalent, e.g. when the original carried redundant
    delimiters such as a '?' followed by an empty query."""
    scheme, netloc, path, params, query, fragment = data
    # Non-empty params are re-attached to the path behind a ';'.
    path_and_params = "%s;%s" % (path, params) if params else path
    return urlunsplit((scheme, netloc, path_and_params, query, fragment))
217
def urlunsplit(data):
    """Combine a 5-tuple (scheme, netloc, path, query, fragment) into a
    URL string.

    Empty components are left out together with their delimiters."""
    scheme, netloc, result, query, fragment = data
    # Emit the '//' authority marker when there is a netloc, or when the
    # scheme is one of uses_netloc and the path does not already start
    # with '//'.
    wants_authority = netloc or (scheme and scheme in uses_netloc
                                 and result[:2] != '//')
    if wants_authority:
        if result and result[:1] != '/':
            result = '/' + result
        result = '//' + (netloc or '') + result
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000230
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, _bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    # A different scheme, or one that has no relative form, means *url*
    # is returned untouched.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty relative path: reuse the base path.
        path = bpath
        if params:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))

    # Merge: base path minus its last segment, followed by the relative
    # path's segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly collapse the first "<name>/.." pair until none is left;
    # the first and last segments are never treated as the '..' here.
    while True:
        for idx in range(1, len(segments) - 1):
            if (segments[idx] == '..'
                    and segments[idx - 1] not in ('', '..')):
                del segments[idx - 1:idx + 1]
                break
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000287
def urldefrag(url):
    """Strip the fragment, if any, from *url*.

    Returns a tuple (defragmented URL, fragment).  When *url* contains
    no '#', it is returned unchanged and the fragment is the empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000301
Facundo Batistac585df92008-09-03 22:35:50 +0000302# unquote method for parse_qs and parse_qsl
303# Cannot use directly from urllib as it would create circular reference.
304# urllib uses urlparse methods ( urljoin)
305
Senthil Kumaranf3e9b2a2010-03-18 12:14:15 +0000306
307_hexdig = '0123456789ABCDEFabcdef'
308_hextochr = dict((a+b, chr(int(a+b,16))) for a in _hexdig for b in _hexdig)
Facundo Batistac585df92008-09-03 22:35:50 +0000309
310def unquote(s):
311 """unquote('abc%20def') -> 'abc def'."""
312 res = s.split('%')
313 for i in xrange(1, len(res)):
314 item = res[i]
315 try:
316 res[i] = _hextochr[item[:2]] + item[2:]
317 except KeyError:
318 res[i] = '%' + item
319 except UnicodeDecodeError:
320 res[i] = unichr(int(item[:2], 16)) + item[2:]
321 return "".join(res)
322
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

        Arguments:

        qs: URL-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            URL encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        Returns a dictionary mapping each name to the list of its values,
        in the order parse_qsl() yields them.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
348
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples in the order the fields appear.
    """
    # Fields may be separated by either '&' or ';'.
    fields = []
    for amp_part in qs.split('&'):
        fields.extend(amp_part.split(';'))

    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        raw_name, sep, raw_value = field.partition('=')
        if not sep:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # Handle case of a control-name with no equal sign
            if not keep_blank_values:
                continue
            raw_value = ''
        if raw_value or keep_blank_values:
            name = unquote(raw_name.replace('+', ' '))
            value = unquote(raw_value.replace('+', ' '))
            result.append((name, value))

    return result
388
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000389
# Default input for test().  test() splits each line on whitespace: the first
# word is a URL (the first URL seen becomes the base for all later joins),
# and a line of the form "<url> = <expected>" also gives the expected
# "<URL:...>" rendering of urljoin(base, url).
test_input = """
 http://a/b/c/d

 g:h = <URL:g:h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 g = <URL:http://a/b/c/g>
 ./g = <URL:http://a/b/c/g>
 g/ = <URL:http://a/b/c/g/>
 /g = <URL:http://a/g>
 //g = <URL:http://g>
 ?y = <URL:http://a/b/c/d?y>
 g?y = <URL:http://a/b/c/g?y>
 g?y/./x = <URL:http://a/b/c/g?y/./x>
 . = <URL:http://a/b/c/>
 ./ = <URL:http://a/b/c/>
 .. = <URL:http://a/b/>
 ../ = <URL:http://a/b/>
 ../g = <URL:http://a/b/g>
 ../.. = <URL:http://a/>
 ../../g = <URL:http://a/g>
 ../../../g = <URL:http://a/../g>
 ./../g = <URL:http://a/b/g>
 ./g/. = <URL:http://a/b/c/g/>
 /./g = <URL:http://a/./g>
 g/./h = <URL:http://a/b/c/g/h>
 g/../h = <URL:http://a/b/c/h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 http:?y = <URL:http://a/b/c/d?y>
 http:g?y = <URL:http://a/b/c/g?y>
 http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000423
def test():
    """Run the urlparse()/urljoin() self-test and print the results.

    Input lines come from the file named by the first command-line
    argument ('-' means standard input) or, when no argument is given,
    from test_input.  The first URL read becomes the base against which
    every later URL is joined.  For each line the parsed components and
    the joined URL are printed; a line of the form "<url> = <expected>"
    whose expectation is not met also prints an EXPECTED notice.
    """
    import sys
    base = ''
    opened = None  # file opened here (and therefore closed here), if any
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        fp = StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        # Do not leak the handle for a file named on the command line;
        # sys.stdin is left open.
        if opened is not None:
            opened.close()
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000454
# Run the self-test when this module is executed as a script.
if __name__ == '__main__':
    test()