blob: 1a8151804d481bd468a957243a3b478599c8e934 [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaran420ec8a2010-04-17 14:30:53 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden, B. Carpenter
and L. Masinter, December 1999.
10
11RFC2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
14RFC2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
15
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
19RFC1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20McCahill, December 1994
21
22RFC 3986 is considered the current standard and any changes to urlparse module
23should conform to this. urlparse module is not entirely compliant with this.
The de facto scenarios of parsing are considered sometimes and for backward
compatibility purposes, older RFC uses of parsing are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
27
Guido van Rossume7b146f2000-02-04 15:28:42 +000028"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +000029
# Public names exported by a star import of this module.
__all__ = [
    "urlparse",
    "urlunparse",
    "urljoin",
    "urldefrag",
    "urlsplit",
    "urlunsplit",
    "parse_qs",
    "parse_qsl",
]
Skip Montanaro40fc1602001-03-01 04:27:19 +000032
# Scheme classification tables.  The empty string '' is the scheme of a URL
# that names none, so listing it makes the behaviour apply by default.

# Consulted by urljoin(): schemes whose URLs can be joined onto a base.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '', 'sftp',
]

# Consulted by urljoin() and urlunsplit(): schemes that carry a //netloc part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
    '', 'svn', 'svn+ssh', 'sftp', 'nfs',
]

# Not referenced by the functions in this module.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap', 'snews',
    'sip', 'sips',
]

# Consulted by urlparse(): schemes whose path may end in ;params.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms', '', 'sftp',
]

# Consulted by urlsplit(): schemes for which '?' starts a query.
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms', 'gopher', 'rtsp',
    'rtspu', 'sip', 'sips', '',
]

# Consulted by urlsplit(): schemes for which '#' starts a fragment.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', 'https',
    'shttp', 'snews', 'file', 'prospero', '',
]
Guido van Rossum23cb2a81994-09-12 10:36:35 +000051
# The characters accepted in a scheme name: ASCII letters, digits and the
# punctuation '+', '-' and '.'.
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz' +
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
    '0123456789' +
    '+-.'
)
Guido van Rossum23cb2a81994-09-12 10:36:35 +000057
# urlsplit() memoises its results in _parse_cache.  When the cache already
# holds MAX_CACHE_SIZE entries, urlsplit() empties it completely before
# adding the next one.
MAX_CACHE_SIZE = 20
_parse_cache = {}


def clear_cache():
    """Discard every memoised urlsplit() result."""
    _parse_cache.clear()
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000064
65
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    The class this is mixed into must provide a ``netloc`` attribute; every
    property below is computed from that string only, treating it as
    ``[user[:password]@]host[:port]``.
    """

    @property
    def username(self):
        """Return the user name from netloc, or None when netloc has no '@'."""
        netloc = self.netloc
        if "@" in netloc:
            # Everything before the last '@' is the userinfo.
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                # The first ':' separates the user name from the password.
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Return the password from netloc, or None.

        None is returned when netloc has no '@' or when the userinfo part
        contains no ':'.
        """
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Return the lower-cased host part of netloc.

        For a bracketed literal such as ``[::1]:80`` the text between the
        brackets is returned.  None is returned when nothing follows the
        userinfo.
        """
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            # Bracketed (IPv6) literal: take what sits between '[' and ']'.
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Return the port as an int, or None when no port is given.

        A netloc that ends in a bare ':' (an empty port, e.g. ``host:``)
        is treated as having no port.

        Raises ValueError if the port text is not a base-10 integer.
        """
        # Drop the userinfo and any bracketed literal so that only the
        # optional ':port' suffix (or an unbracketed 'host:port') remains.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            # An empty port would make int() raise; report it as absent.
            if port:
                return int(port, 10)
        return None
Fred Drakead5177c2006-04-01 22:14:43 +0000108
Raymond Hettinger0f6a6562008-01-11 18:04:55 +0000109from collections import namedtuple
Fred Drakead5177c2006-04-01 22:14:43 +0000110
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """Five-field tuple built by urlsplit(), plus the ResultMixin accessors."""

    __slots__ = ()

    def geturl(self):
        """Return the URL string that urlunsplit() builds from this tuple."""
        return urlunsplit(self)
117
118
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """Six-field tuple built by urlparse(), plus the ResultMixin accessors."""

    __slots__ = ()

    def geturl(self):
        """Return the URL string that urlunparse() builds from this tuple."""
        return urlunparse(self)
125
126
def urlparse(url, scheme='', allow_fragments=True):
    """Split a URL into its 6 components.

    The general form is
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    and the result is a ParseResult, i.e. the 6-tuple
    (scheme, netloc, path, params, query, fragment).

    The components are not broken up any further (netloc, for instance,
    stays one string) and % escapes are left as they are.
    """
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    # Parameters are only split off for schemes listed in uses_params.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
Fred Drake5751a222001-11-16 02:52:57 +0000140
def _splitparams(url):
    """Split the ';params' suffix off the last segment of a path.

    Only a ';' found after the final '/' counts; a ';' in an earlier
    segment is left in the path.

    Returns a (path, params) pair.  params is '' when the last segment
    contains no ';', and the path is then returned unchanged.
    """
    # Start searching just past the last '/'.  rfind() yields -1 when there
    # is no slash at all, which conveniently makes the search start at 0.
    i = url.find(';', url.rfind('/') + 1)
    if i < 0:
        # No separator: never let i == -1 reach the slices below, where it
        # would silently cut the final character off the path.
        return url, ''
    return url[:i], url[i+1:]
149
def _splitnetloc(url, start=0):
    """Cut *url* at the first '/', '?' or '#' found at or after *start*.

    Returns (url[start:cut], url[cut:]).  When none of the three
    delimiters occurs, the cut point is the end of the string, so the
    second element is ''.
    """
    hits = [url.find(delim, start) for delim in '/?#']
    found = [pos for pos in hits if pos >= 0]
    # The earliest delimiter wins; with no delimiter, take everything.
    cut = min(found) if found else len(url)
    return url[start:cut], url[cut:]
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000157
def urlsplit(url, scheme='', allow_fragments=True):
    """Split a URL into its 5 components.

    The general form is
    <scheme>://<netloc>/<path>?<query>#<fragment>
    and the result is a SplitResult, i.e. the 5-tuple
    (scheme, netloc, path, query, fragment).

    The components are not broken up any further (netloc, for instance,
    stays one string) and % escapes are left as they are.

    Results are memoised in _parse_cache.  Raises ValueError when the
    netloc contains only one of '[' and ']'.
    """
    allow_fragments = bool(allow_fragments)
    # The types of url and scheme are part of the cache key.
    key = (url, scheme, allow_fragments, type(url), type(scheme))
    hit = _parse_cache.get(key)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    # A literal 'http' prefix is the common case: it skips the per-character
    # scheme check and the uses_fragment / uses_query lookups further down.
    is_http = False
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        is_http = prefix == 'http'
        if is_http or all(c in scheme_chars for c in prefix):
            scheme, url = prefix.lower(), url[colon + 1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Brackets must come as a pair.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if (allow_fragments and (is_http or scheme in uses_fragment)
            and '#' in url):
        url, fragment = url.split('#', 1)
    if (is_http or scheme in uses_query) and '?' in url:
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000209
def urlunparse(data):
    """Reassemble a URL from the 6 components produced by urlparse().

    The result can differ slightly from the URL that was parsed while
    still being equivalent, namely when the original carried redundant
    delimiters such as a '?' followed by an empty query (the draft states
    that these are equivalent).
    """
    scheme, netloc, path, params, query, fragment = data
    # Non-empty params go back onto the path behind a ';'.
    full_path = "%s;%s" % (path, params) if params else path
    return urlunsplit((scheme, netloc, full_path, query, fragment))
219
def urlunsplit(data):
    """Reassemble a URL from the 5 components produced by urlsplit().

    *data* is any iterable yielding (scheme, netloc, path, query,
    fragment).  Empty components are left out together with their
    delimiter.
    """
    scheme, netloc, path, query, fragment = data
    # A '//' authority part is written when there is a netloc, or when the
    # scheme is listed in uses_netloc and the path does not already start
    # with '//'.
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and path[:2] != '//')
    if wants_authority:
        if path and path[:1] != '/':
            path = '/' + path
        path = '//' + (netloc or '') + path
    if scheme:
        path = scheme + ':' + path
    if query:
        path = path + '?' + query
    if fragment:
        path = path + '#' + fragment
    return path
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000232
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL.

    Returns the absolute interpretation of *url* relative to *base*.
    If *base* is empty *url* is returned untouched, and if *url* is empty
    *base* is returned.  *url* is also returned untouched when its scheme
    differs from the base's or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base

    base_parts = urlparse(base, '', allow_fragments)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = base_parts
    # The base scheme is the default scheme when parsing url.
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # url names its own host: nothing is taken from the base.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        path = bpath
        if params:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))

    # Relative path: replace the base's last segment with url's segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly drop the first '<name>/..' pair that is not at the very
    # end of the list, until no such pair is left.
    collapsing = True
    while collapsing:
        collapsing = False
        for idx in range(1, len(segments) - 1):
            if (segments[idx] == '..'
                    and segments[idx - 1] not in ('', '..')):
                del segments[idx - 1:idx + 1]
                collapsing = True
                break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000289
def urldefrag(url):
    """Strip the fragment, if any, from a URL.

    Returns a (url_without_fragment, fragment) tuple.  When *url*
    contains no '#', it is returned unchanged with '' as the fragment.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000303
# unquote() as needed by parse_qs() and parse_qsl().
# It cannot simply be imported from urllib: urllib itself uses functions
# from this module (urljoin), so the import would be circular.

_hexdig = '0123456789ABCDEFabcdef'
# Maps every two-hex-digit string (in any letter case) to its character.
_hextochr = dict((hi + lo, chr(int(hi + lo, 16)))
                 for hi in _hexdig for lo in _hexdig)


def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    chunks = s.split('%')
    # The text before the first '%' never needs decoding.
    decoded = [chunks[0]]
    for item in chunks[1:]:
        try:
            decoded.append(_hextochr[item[:2]] + item[2:])
        except KeyError:
            # Not followed by two hex digits: keep the '%' literally.
            decoded.append('%' + item)
        except UnicodeDecodeError:
            decoded.append(unichr(int(item[:2], 16)) + item[2:])
    return "".join(decoded)
324
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a dictionary.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dict mapping each field name to the list of its values,
    in the order parse_qsl() produced them.
    """
    values_by_name = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        values_by_name.setdefault(name, []).append(value)
    return values_by_name
350
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples in the order the fields
    appear in qs.  Fields are separated by '&' or ';'; in both names and
    values '+' becomes a space before % escapes are decoded.
    """
    fields = []
    for amp_part in qs.split('&'):
        fields.extend(amp_part.split(';'))

    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        raw_name, equals, raw_value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control name without an equal sign counts as a blank value.
            if not keep_blank_values:
                continue
        if raw_value or keep_blank_values:
            name = unquote(raw_name.replace('+', ' '))
            value = unquote(raw_value.replace('+', ' '))
            result.append((name, value))

    return result
390
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000391
# Self-test data.  The first non-blank line is the base URL; each later line
# reads "<relative url> = <expected result of joining it onto the base>".
test_input = """
 http://a/b/c/d

 g:h = <URL:g:h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 g = <URL:http://a/b/c/g>
 ./g = <URL:http://a/b/c/g>
 g/ = <URL:http://a/b/c/g/>
 /g = <URL:http://a/g>
 //g = <URL:http://g>
 ?y = <URL:http://a/b/c/d?y>
 g?y = <URL:http://a/b/c/g?y>
 g?y/./x = <URL:http://a/b/c/g?y/./x>
 . = <URL:http://a/b/c/>
 ./ = <URL:http://a/b/c/>
 .. = <URL:http://a/b/>
 ../ = <URL:http://a/b/>
 ../g = <URL:http://a/b/g>
 ../.. = <URL:http://a/>
 ../../g = <URL:http://a/g>
 ../../../g = <URL:http://a/../g>
 ./../g = <URL:http://a/b/g>
 ./g/. = <URL:http://a/b/c/g/>
 /./g = <URL:http://a/./g>
 g/./h = <URL:http://a/b/c/g/h>
 g/../h = <URL:http://a/b/c/h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 http:?y = <URL:http://a/b/c/d?y>
 http:g?y = <URL:http://a/b/c/g?y>
 http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""


def test():
    """Print the parse and join result for every URL in the test data.

    The data comes from the file named by the first command-line
    argument ('-' means standard input), or from test_input when no
    argument is given.  The first URL read becomes the base for all the
    following ones.  A line of the form "url = expected" whose expected
    text differs from the computed result gets an extra EXPECTED line.
    """
    import sys
    base = ''
    args = sys.argv[1:]
    if args:
        fn = args[0]
        fp = sys.stdin if fn == '-' else open(fn)
    else:
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        fp = StringIO(test_input)
    for line in fp:
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print('%-10s : %s' % (url, parts))
        absolute = urljoin(base, url)
        if not base:
            base = absolute
        wrapped = '<URL:%s>' % absolute
        print('%-10s = %s' % (url, wrapped))
        if len(words) == 3 and words[1] == '=' and wrapped != words[2]:
            print(' '.join(('EXPECTED', words[2], '!!!!!!!!!!')))


if __name__ == '__main__':
    test()