blob: 4c57725ce35fc1a0d6edae873a44669be169d18c [file] [log] [blame]
"""Parse (absolute and relative) URLs.

urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The testcases in
test_urlparse.py provide a good indicator of parsing behavior.

"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +000030
# Names exported by ``from urlparse import *``.  Other module-level names
# (for example ``unquote`` and ``clear_cache``) are defined below but are
# deliberately not listed here.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
Skip Montanaro40fc1602001-03-01 04:27:19 +000033
# A classification of schemes ('' means apply by default)

# Schemes for which urljoin() resolves a relative reference against the
# base; for any other scheme urljoin() returns the reference unchanged.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh']
# Schemes that take a "//netloc" part.  Consulted by urlunsplit() (to emit
# "//" even when netloc is empty) and by urljoin() (to inherit the base
# netloc).
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp','nfs','git', 'git+ssh']
# Schemes for which urlparse() splits ";params" off the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() memoises its results in _parse_cache and empties the whole
# cache (via clear_cache) once it holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}
55
def clear_cache():
    """Clear the parse cache.

    Empties the module-level ``_parse_cache`` dictionary in place, so the
    same dict object stays bound to that name.  urlsplit() calls this when
    the cache has reached MAX_CACHE_SIZE entries.
    """
    _parse_cache.clear()
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000059
60
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Every property is derived from ``self.netloc``, which the concrete
    result class must provide as a string of the general form
    ``[user[:password]@]host[:port]``.
    """

    @property
    def username(self):
        """User name from the userinfo part, or None if netloc has no '@'."""
        netloc = self.netloc
        if "@" in netloc:
            # Split on the last '@' so a '@' inside the userinfo is kept.
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Password from the userinfo part, or None if none is present."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                # Everything after the first ':' belongs to the password.
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host name, or None when netloc is empty.

        For a bracketed IPv6 literal the text between '[' and ']' is
        returned without the brackets.
        """
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Port number as an int, or None if no port is given.

        A netloc with a trailing ':' but no digits (``"host:"``) has an
        empty port; it is reported as None instead of failing in int().
        Raises ValueError if the port text is not a base-10 integer.
        """
        # Drop the userinfo and any bracketed IPv6 literal so that the only
        # ':' left is the one introducing the port.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            if port:
                return int(port, 10)
        return None
Fred Drakead5177c2006-04-01 22:14:43 +0000103
Raymond Hettinger0f6a6562008-01-11 18:04:55 +0000104from collections import namedtuple
Fred Drakead5177c2006-04-01 22:14:43 +0000105
class SplitResult(namedtuple('SplitResult',
                             ['scheme', 'netloc', 'path', 'query',
                              'fragment']),
                  ResultMixin):
    """5-tuple result of urlsplit(), with the ResultMixin accessors."""

    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL string via urlunsplit()."""
        return urlunsplit(self)
112
113
class ParseResult(namedtuple('ParseResult',
                             ['scheme', 'netloc', 'path', 'params', 'query',
                              'fragment']),
                  ResultMixin):
    """6-tuple result of urlparse(), with the ResultMixin accessors."""

    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL string via urlunparse()."""
        return urlunparse(self)
120
121
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a ParseResult 6-tuple:
    (scheme, netloc, path, params, query, fragment).

    The components are not broken up into smaller bits (e.g. netloc is a
    single string) and % escapes are not expanded.  ``scheme`` and
    ``allow_fragments`` are passed straight through to urlsplit().
    """
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    # Parameters are only separated out for schemes known to use them.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
Fred Drake5751a222001-11-16 02:52:57 +0000135
136def _splitparams(url):
137 if '/' in url:
138 i = url.find(';', url.rfind('/'))
139 if i < 0:
140 return url, ''
141 else:
142 i = url.find(';')
143 return url[:i], url[i+1:]
144
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000145def _splitnetloc(url, start=0):
Guido van Rossumc6a04c22008-01-05 22:19:06 +0000146 delim = len(url) # position of end of domain part of url, default is end
147 for c in '/?#': # look for delimiters; the order is NOT important
148 wdelim = url.find(c, start) # find first of this delim
149 if wdelim >= 0: # if found
150 delim = min(delim, wdelim) # use earliest delim position
151 return url[start:delim], url[delim:] # return (domain, rest)
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000152
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Return a SplitResult 5-tuple: (scheme, netloc, path, query, fragment).

    The components are not broken up into smaller bits (e.g. netloc is a
    single string) and % escapes are not expanded.  ``scheme`` is the
    default used when the URL does not name one.  If ``allow_fragments``
    is false, '#' is not treated as a fragment delimiter.

    Results are memoised in the module-level parse cache.  Raises
    ValueError("Invalid IPv6 URL") when the netloc contains only one of
    '[' and ']'.
    """
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key -- presumably so that equal
    # strings of different types do not share a cache entry.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix, rest = url[:colon], url[colon + 1:]
        if prefix == 'http':
            # Common case: taken as the scheme without the port-number
            # check applied to other candidates below.
            scheme, url = prefix.lower(), rest
        elif all(ch in scheme_chars for ch in prefix):
            # If everything after the colon is digits, it is a port number
            # and the "scheme" is really part of the path; leave the URL
            # untouched in that case.
            if not rest or any(ch not in '0123456789' for ch in rest):
                scheme, url = prefix.lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # An IPv6 literal needs both brackets; exactly one is an error.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000207
def urlunparse(data):
    """Put a parsed URL back together again.

    ``data`` is a six-item iterable
    (scheme, netloc, path, params, query, fragment).  The result may be a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query.
    """
    scheme, netloc, path, params, query, fragment = data
    if params:
        # Fold the parameters back into the path before delegating.
        path = "%s;%s" % (path, params)
    return urlunsplit((scheme, netloc, path, query, fragment))
217
def urlunsplit(data):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string.

    ``data`` can be any five-item iterable
    (scheme, netloc, path, query, fragment).  The result may be a slightly
    different, but equivalent URL, if the URL that was parsed originally
    had unnecessary delimiters (for example, a ? with an empty query).
    """
    scheme, netloc, path, query, fragment = data
    # Emit "//netloc" when there is a netloc, or when the scheme is one that
    # takes a netloc and the path does not already start with "//".
    if netloc or (scheme and scheme in uses_netloc and path[:2] != '//'):
        if path and path[:1] != '/':
            path = '/' + path
        path = '//' + (netloc or '') + path
    if scheme:
        path = scheme + ':' + path
    if query:
        path = path + '?' + query
    if fragment:
        path = path + '#' + fragment
    return path
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000235
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned as is.
    ``allow_fragments`` is passed through to urlparse() for both URLs.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default for the reference, so a scheme-less
    # reference compares equal to the base below.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that does not support relative
    # references: the reference is returned untouched.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own netloc; nothing else is taken
            # from the base.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: keep it, only scheme/netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path and not params:
        # Query- and/or fragment-only reference: reuse the base path and
        # params, and the base query too unless the reference has one.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: drop the last segment of the base path and append the
    # segments of the reference.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' becomes an empty segment, which keeps a trailing '/'.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly remove the first "<name>/.." pair, restarting the scan
    # after each removal; stop once a full scan removes nothing.  The scan
    # covers indices 1 .. len-2, so the first and last segments are never
    # tested for '..' here.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # A '..' left in final position is handled separately.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # Replace the final two segments by a single empty one.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000287
def urldefrag(url):
    """Remove any existing fragment from ``url``.

    Return a tuple (defragmented URL, fragment).  If the URL contains no
    '#', it is returned unchanged together with an empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000301
Facundo Batistac585df92008-09-03 22:35:50 +0000302# unquote method for parse_qs and parse_qsl
R. David Murraybfbdefe2010-05-25 15:20:46 +0000303# Cannot use directly from urllib as it would create a circular reference
304# because urllib uses urlparse methods (urljoin). If you update this function,
# update it also in urllib.  This code duplication does not exist in Python3.
Senthil Kumaranf3e9b2a2010-03-18 12:14:15 +0000306
_hexdig = '0123456789ABCDEFabcdef'
# Lookup table from every two-hex-digit string (any mix of upper and lower
# case) to the character with that ordinal.
_hextochr = dict((hi + lo, chr(int(hi + lo, 16)))
                 for hi in _hexdig for lo in _hexdig)

def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Each '%' followed by two hex digits is replaced by the corresponding
    character; a '%' not followed by two hex digits is left as is.
    """
    pieces = s.split('%')
    # fastpath: no '%' at all
    if len(pieces) == 1:
        return s
    result = pieces[0]
    for piece in pieces[1:]:
        code, tail = piece[:2], piece[2:]
        try:
            result += _hextochr[code] + tail
        except KeyError:
            # Not a valid escape: put the '%' back untouched.
            result += '%' + piece
        except UnicodeDecodeError:
            # NOTE(review): presumably reached on Python 2 when a unicode
            # input meets a non-ASCII byte from the table -- confirm.
            result += unichr(int(code, 16)) + tail
    return result
Facundo Batistac585df92008-09-03 22:35:50 +0000326
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dictionary mapping each field name to the list of its
    values, in the order parse_qsl() produced them.
    """
    fields = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        fields.setdefault(name, []).append(value)
    return fields
352
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) pairs in query order.  Fields are
    separated by '&' or ';'; '+' is turned into a space before the name and
    value are percent-decoded.
    """
    fields = [part for chunk in qs.split('&') for part in chunk.split(';')]
    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        raw_name, sep, raw_value = field.partition('=')
        if not sep:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # Handle case of a control-name with no equal sign: kept with a
            # blank value only when blank values are wanted.
            if not keep_blank_values:
                continue
        if raw_value or keep_blank_values:
            name = unquote(raw_name.replace('+', ' '))
            value = unquote(raw_value.replace('+', ' '))
            result.append((name, value))

    return result
391 return r