blob: b42e0f4c621774ee23c52c3db719671b8c314022 [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaran420ec8a2010-04-17 14:30:53 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski, July 1998.
Senthil Kumaran420ec8a2010-04-17 14:30:53 +000015
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
Andrew M. Kuchlingba88b7f2010-04-30 00:49:09 +000019RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaran420ec8a2010-04-17 14:30:53 +000020McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
28
Guido van Rossume7b146f2000-02-04 15:28:42 +000029"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +000030
Fred Drakef606e8d2002-10-16 21:21:39 +000031__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
Facundo Batistac585df92008-09-03 22:35:50 +000032 "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
Skip Montanaro40fc1602001-03-01 04:27:19 +000033
# A classification of schemes.  The empty string '' is a member wherever the
# behaviour should apply by default (i.e. when no scheme is given).
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '', 'sftp', 'svn',
    'svn+ssh',
]
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu',
    'rsync', '', 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
]
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap', 'snews',
    'sip', 'sips',
]
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms', '', 'sftp',
]
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms', 'gopher', 'rtsp',
    'rtspu', 'sip', 'sips', '',
]
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', 'https',
    'shttp', 'snews', 'file', 'prospero', '',
]
Guido van Rossum23cb2a81994-09-12 10:36:35 +000053
# Every character that may appear in a scheme name: ASCII letters of either
# case, decimal digits, and the punctuation '+', '-' and '.'.
scheme_chars = ''.join([
    'abcdefghijklmnopqrstuvwxyz',
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
    '0123456789',
    '+-.',
])
Guido van Rossum23cb2a81994-09-12 10:36:35 +000059
# Number of entries the parse cache may hold before it is emptied.
MAX_CACHE_SIZE = 20
# Module-wide cache of parse results.
_parse_cache = dict()


def clear_cache():
    """Discard every entry held in the parse cache."""
    _parse_cache.clear()
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000066
67
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Each property is computed on access from ``self.netloc``, which the
    class mixing this in must provide as a string.
    """

    @property
    def username(self):
        """User name from the userinfo part of netloc, or None if netloc
        contains no '@'."""
        netloc = self.netloc
        if "@" in netloc:
            # Split on the last '@' so that an '@' inside the userinfo does
            # not truncate it.
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Password from the userinfo part of netloc, or None if netloc has
        no '@' or the userinfo has no ':'."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host name with userinfo, port and any IPv6 brackets
        removed, or None if nothing follows the userinfo."""
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            # Bracketed literal: keep what lies between '[' and ']'.
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Port number as an int, or None if netloc carries no port.

        An empty port (a trailing ':' with no digits, e.g. 'host:') is
        reported as None instead of failing inside int().  A non-empty port
        that is not a base-10 integer still raises ValueError.
        """
        # Drop the userinfo and anything up to a closing IPv6 bracket so that
        # the only ':' left is the host/port separator.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            if port:
                return int(port, 10)
        return None
Fred Drakead5177c2006-04-01 22:14:43 +0000110
Raymond Hettinger0f6a6562008-01-11 18:04:55 +0000111from collections import namedtuple
Fred Drakead5177c2006-04-01 22:14:43 +0000112
class SplitResult(namedtuple('SplitResult',
                             ['scheme', 'netloc', 'path', 'query',
                              'fragment']),
                  ResultMixin):
    """Five-field named tuple (scheme, netloc, path, query, fragment) that
    also offers the ResultMixin properties."""

    __slots__ = ()

    def geturl(self):
        """Return the URL string rebuilt from this tuple by urlunsplit()."""
        return urlunsplit(self)
119
120
class ParseResult(namedtuple('ParseResult',
                             ['scheme', 'netloc', 'path', 'params', 'query',
                              'fragment']),
                  ResultMixin):
    """Six-field named tuple (scheme, netloc, path, params, query, fragment)
    that also offers the ResultMixin properties."""

    __slots__ = ()

    def geturl(self):
        """Return the URL string rebuilt from this tuple by urlunparse()."""
        return urlunparse(self)
127
128
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    # Parameters are only separated out for schemes that are known to use
    # them; otherwise any ';' stays part of the path.
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
Fred Drake5751a222001-11-16 02:52:57 +0000142
143def _splitparams(url):
144 if '/' in url:
145 i = url.find(';', url.rfind('/'))
146 if i < 0:
147 return url, ''
148 else:
149 i = url.find(';')
150 return url[:i], url[i+1:]
151
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000152def _splitnetloc(url, start=0):
Guido van Rossumc6a04c22008-01-05 22:19:06 +0000153 delim = len(url) # position of end of domain part of url, default is end
154 for c in '/?#': # look for delimiters; the order is NOT important
155 wdelim = url.find(c, start) # find first of this delim
156 if wdelim >= 0: # if found
157 delim = min(delim, wdelim) # use earliest delim position
158 return url[start:delim], url[delim:] # return (domain, rest)
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000159
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    The *scheme* argument is used when the URL itself does not start with
    one.  Results are kept in a module-level cache.  ValueError is raised
    when the netloc contains only one of '[' and ']'."""
    allow_fragments = bool(allow_fragments)
    # The types of both string arguments are part of the cache key.
    cache_key = (url, scheme, allow_fragments, type(url), type(scheme))
    hit = _parse_cache.get(cache_key)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # avoid runaway growth
        clear_cache()

    netloc = ''
    query = ''
    fragment = ''
    colon = url.find(':')
    if colon > 0:
        head = url[:colon]
        tail = url[colon + 1:]
        if head == 'http':
            # optimize the common case: fragment and query are always
            # split off, without consulting the scheme tables
            scheme = head.lower()
            url = tail
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                # exactly one of the two brackets present -> malformed
                if ('[' in netloc) != (']' in netloc):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            result = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[cache_key] = result
            return result
        if all(ch in scheme_chars for ch in head):
            # make sure the text after the colon is not actually a port
            # number (in which case "scheme" is really part of the path)
            try:
                int(tail)
            except ValueError:
                scheme, url = head.lower(), tail

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000214
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    *data* is any six-item iterable
    (scheme, netloc, path, params, query, fragment)."""
    scheme, netloc, path, params, query, fragment = data
    # Non-empty params are folded back into the path behind a ';'.
    full_path = "%s;%s" % (path, params) if params else path
    return urlunsplit((scheme, netloc, full_path, query, fragment))
224
def urlunsplit(data):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, path, query, fragment = data
    # A '//' authority prefix is written when there is a netloc, or when the
    # scheme is one that uses a netloc and the path does not already start
    # with '//'.
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and path[:2] != '//')
    if wants_authority:
        if path and path[:1] != '/':
            path = '/' + path
        path = '//' + (netloc or '') + path
    result = path
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000242
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.  *url*
    is also returned unchanged when its scheme differs from the base's or is
    not listed in uses_relative."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # url names its own authority: nothing is taken from the base
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # absolute path: only scheme/netloc come from the base
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not (path or params):
        # no path and no params: reuse the base's, and its query too when
        # url has none of its own
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, bpath,
                           bparams, query, fragment))

    # Relative path: replace the last segment of the base path with url's
    # segments, then resolve '.' and '..'.
    parts = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if parts[-1] == '.':
        parts[-1] = ''
    parts = [part for part in parts if part != '.']
    # Repeatedly drop the first interior "<name>/.." pair until none is left.
    # The first and last positions are never examined by this scan.
    collapsed = True
    while collapsed:
        collapsed = False
        for idx in range(1, len(parts) - 1):
            if parts[idx] == '..' and parts[idx - 1] not in ('', '..'):
                del parts[idx - 1:idx + 1]
                collapsed = True
                break
    if parts == ['', '..']:
        parts[-1] = ''
    elif len(parts) >= 2 and parts[-1] == '..':
        parts[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(parts),
                       params, query, fragment))
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000294
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000308
# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin).  If you update this function,
# update it also in urllib.  This code duplication does not exist in Python 3.
Senthil Kumaranf3e9b2a2010-03-18 12:14:15 +0000313
314_hexdig = '0123456789ABCDEFabcdef'
R. David Murraybfbdefe2010-05-25 15:20:46 +0000315_hextochr = dict((a+b, chr(int(a+b,16)))
316 for a in _hexdig for b in _hexdig)
Facundo Batistac585df92008-09-03 22:35:50 +0000317
318def unquote(s):
319 """unquote('abc%20def') -> 'abc def'."""
320 res = s.split('%')
R. David Murraybfbdefe2010-05-25 15:20:46 +0000321 # fastpath
322 if len(res) == 1:
323 return s
324 s = res[0]
325 for item in res[1:]:
Facundo Batistac585df92008-09-03 22:35:50 +0000326 try:
R. David Murraybfbdefe2010-05-25 15:20:46 +0000327 s += _hextochr[item[:2]] + item[2:]
Facundo Batistac585df92008-09-03 22:35:50 +0000328 except KeyError:
R. David Murraybfbdefe2010-05-25 15:20:46 +0000329 s += '%' + item
Facundo Batistac585df92008-09-03 22:35:50 +0000330 except UnicodeDecodeError:
R. David Murraybfbdefe2010-05-25 15:20:46 +0000331 s += unichr(int(item[:2], 16)) + item[2:]
332 return s
Facundo Batistac585df92008-09-03 22:35:50 +0000333
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        Returns a dictionary mapping each name to the list of its values,
        in the order parse_qsl() yields them.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
359
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples in the order the fields appear
    in qs.  Fields are separated by '&' or ';'; in both name and value,
    '+' is turned into a space and % escapes are expanded.

    Raises ValueError when strict_parsing is true and a field has no '='.
    """
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        # Empty fields (e.g. from '&&') are skipped unless parsing strictly,
        # in which case they fall through to the error below.
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                # Call form of raise: valid on both Python 2 and Python 3.
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
398 return r