blob: f370ce3bdcc17b11e446a1e83a7df74d50c6b7cb [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaran420ec8a2010-04-17 14:30:53 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden, B. Carpenter
and L. Masinter, December 1999.
10
Andrew M. Kuchlingba88b7f2010-04-30 00:49:09 +000011RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaran420ec8a2010-04-17 14:30:53 +000012Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
Andrew M. Kuchlingba88b7f2010-04-30 00:49:09 +000014RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
Senthil Kumaran420ec8a2010-04-17 14:30:53 +000015
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
Andrew M. Kuchlingba88b7f2010-04-30 00:49:09 +000019RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaran420ec8a2010-04-17 14:30:53 +000020McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
28
Guido van Rossume7b146f2000-02-04 15:28:42 +000029"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +000030
# Names exported by ``from urlparse import *``.
__all__ = [
    "urlparse", "urlunparse", "urljoin", "urldefrag",
    "urlsplit", "urlunsplit", "parse_qs", "parse_qsl",
]
Skip Montanaro40fc1602001-03-01 04:27:19 +000033
# A classification of schemes ('' means apply by default)

# Schemes for which urljoin() resolves a relative reference against the base.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', '', 'sftp',
    'svn', 'svn+ssh',
]

# Schemes whose URLs carry a '//netloc' part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
    'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
]

# Schemes for which urlparse() splits ';params' off the path.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    'mms', '', 'sftp', 'tel',
]
Guido van Rossum23cb2a81994-09-12 10:36:35 +000046
# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips',
]
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '',
]
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero', '',
]
56
# Characters valid in scheme names
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz'  # lowercase letters
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'  # uppercase letters
    '0123456789'                  # digits
    '+-.'                         # the three permitted punctuation marks
)
Guido van Rossum23cb2a81994-09-12 10:36:35 +000062
# Upper bound on the number of memoised urlsplit() results.
MAX_CACHE_SIZE = 20
# Maps urlsplit() argument keys to their SplitResult.
_parse_cache = {}


def clear_cache():
    """Empty the module-level cache of urlsplit() results in place."""
    _parse_cache.clear()
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000069
70
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Every property is computed from ``self.netloc``, which the class mixing
    this in must provide.
    """

    @property
    def username(self):
        """Return the user name from the netloc, or None without an '@'."""
        netloc = self.netloc
        if "@" in netloc:
            # Everything before the last '@' is userinfo ("user[:password]").
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Return the password from the netloc, or None if there is none."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Return the lower-cased host name, or None for an empty netloc.

        For a bracketed host ("[...]") the text between the brackets is
        returned.
        """
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Return the port as an int, or None.

        None is returned when the netloc has no port, when the port is empty
        (e.g. "host:"), or when it lies outside 0..65535.

        Raises ValueError if the port text is not a base-10 integer.
        """
        # Drop userinfo and any bracketed host so only ":port" can remain.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            # An empty port ("host:") means "no port"; int('') would raise.
            if port:
                port = int(port, 10)
                # verify legal port
                if 0 <= port <= 65535:
                    return port
        return None
Fred Drakead5177c2006-04-01 22:14:43 +0000115
Raymond Hettinger0f6a6562008-01-11 18:04:55 +0000116from collections import namedtuple
Fred Drakead5177c2006-04-01 22:14:43 +0000117
class SplitResult(namedtuple('SplitResult',
                             ['scheme', 'netloc', 'path', 'query',
                              'fragment']),
                  ResultMixin):
    """Five-field named tuple with the ResultMixin netloc accessors."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL string with urlunsplit()."""
        return urlunsplit(self)
124
125
class ParseResult(namedtuple('ParseResult',
                             ['scheme', 'netloc', 'path', 'params', 'query',
                              'fragment']),
                  ResultMixin):
    """Six-field named tuple with the ResultMixin netloc accessors."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL string with urlunparse()."""
        return urlunparse(self)
132
133
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Returns a ParseResult (scheme, netloc, path, params, query, fragment).
    The components are not broken up further (netloc stays a single
    string) and % escapes are not expanded.

    The splitting is delegated to urlsplit(); params are only separated
    from the path when the scheme is listed in uses_params.
    """
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
Fred Drake5751a222001-11-16 02:52:57 +0000147
148def _splitparams(url):
149 if '/' in url:
150 i = url.find(';', url.rfind('/'))
151 if i < 0:
152 return url, ''
153 else:
154 i = url.find(';')
155 return url[:i], url[i+1:]
156
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000157def _splitnetloc(url, start=0):
Guido van Rossumc6a04c22008-01-05 22:19:06 +0000158 delim = len(url) # position of end of domain part of url, default is end
159 for c in '/?#': # look for delimiters; the order is NOT important
160 wdelim = url.find(c, start) # find first of this delim
161 if wdelim >= 0: # if found
162 delim = min(delim, wdelim) # use earliest delim position
163 return url[start:delim], url[delim:] # return (domain, rest)
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000164
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Returns a SplitResult (scheme, netloc, path, query, fragment).  The
    components are not broken up further (netloc stays a single string)
    and % escapes are not expanded.

    Results are memoised in the module-level _parse_cache, which is
    emptied whenever it reaches MAX_CACHE_SIZE entries.

    Raises ValueError when the netloc contains only one of '[' and ']'.
    """
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the cache key.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':
            # Common case: accepted without the port-number check below.
            scheme, url = prefix.lower(), url[colon + 1:]
        elif all(ch in scheme_chars for ch in prefix):
            # Make sure what follows the colon is not actually a port
            # number (in which case the "scheme" is really part of the path).
            remainder = url[colon + 1:]
            if not remainder or any(ch not in '0123456789'
                                    for ch in remainder):
                scheme, url = prefix.lower(), remainder

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Brackets must come as a pair.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000219
def urlunparse(data):
    """Rebuild a URL string from a six-item parse result.

    ``data`` is unpacked as (scheme, netloc, url, params, query, fragment).
    Non-empty params are re-attached to the path with ';' and the rest is
    handed to urlunsplit().  The result may differ slightly from, but be
    equivalent to, the URL originally parsed if that URL had redundant
    delimiters, e.g. a ? with an empty query.
    """
    scheme, netloc, url, params, query, fragment = data
    path = "%s;%s" % (url, params) if params else url
    return urlunsplit((scheme, netloc, path, query, fragment))
229
def urlunsplit(data):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string.

    ``data`` can be any five-item iterable, unpacked as
    (scheme, netloc, url, query, fragment).  The result may differ
    slightly from, but be equivalent to, the URL originally parsed if that
    URL had unnecessary delimiters (for example, a ? with an empty query).
    """
    scheme, netloc, url, query, fragment = data
    # A '//' authority prefix is written when there is a netloc, or when
    # the scheme is a netloc-using one and the path does not already
    # start with '//'.
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000247
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty ``base`` returns ``url`` and an empty ``url`` returns ``base``.
    ``url`` is also returned unchanged when its scheme differs from the
    base's or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base
    base_parts = urlparse(base, '', allow_fragments)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = base_parts
    # The base scheme is the default scheme for the reference.
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own netloc: nothing is inherited.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path and not params:
        # Query/fragment-only reference: reuse the base path and params,
        # and the base query too when the reference has none.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))

    # Relative path: replace the last base segment with the reference path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly delete the first "name/.." pair found among the interior
    # segments until a full scan finds none.
    collapsed = True
    while collapsed:
        collapsed = False
        for idx in range(1, len(segments) - 1):
            if (segments[idx] == '..'
                    and segments[idx - 1] not in ('', '..')):
                del segments[idx - 1:idx + 1]
                collapsed = True
                break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000299
def urldefrag(url):
    """Remove any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If the URL
    contains no '#', it is returned unchanged and the second element is
    the empty string.
    """
    if '#' not in url:
        return url, ''
    parts = urlparse(url)
    # Re-assemble the first five fields with the fragment blanked out.
    return urlunparse(parts[:5] + ('',)), parts[5]
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000313
# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin).  If you update this function,
# update it also in urllib.  This code duplication does not exist in Python3.
Senthil Kumaranf3e9b2a2010-03-18 12:14:15 +0000318
_hexdig = '0123456789ABCDEFabcdef'
# Every two-hex-digit string (either letter case) mapped to its character.
_hextochr = dict((hi + lo, chr(int(hi + lo, 16)))
                 for hi in _hexdig for lo in _hexdig)


def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    A '%' that is not followed by two hex digits is left in place.
    """
    pieces = s.split('%')
    # fastpath: nothing to decode
    if len(pieces) == 1:
        return s
    decoded = [pieces[0]]
    for chunk in pieces[1:]:
        code, tail = chunk[:2], chunk[2:]
        try:
            decoded.append(_hextochr[code] + tail)
        except KeyError:
            # Not a valid escape: keep the text, '%' included.
            decoded.append('%' + chunk)
        except UnicodeDecodeError:
            decoded.append(unichr(int(code, 16)) + tail)
    return ''.join(decoded)
Facundo Batistac585df92008-09-03 22:35:50 +0000338
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dictionary mapping each name to the list of its values, in
    the order parse_qsl() produced them.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
364
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples in the order the fields appear
    in ``qs``.  Fields are separated by '&' or ';'; '+' is turned into a
    space before percent-decoding.

    Raises ValueError when strict_parsing is true and a field has no '='.
    """
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                # Call form of raise: valid on both Python 2 and Python 3.
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if nv[1] or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
403 return r