blob: 8a207565030567ea6507a617787541a7b10c416d [file] [log] [blame]
"""Parse (absolute and relative) URLs.

urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to the
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The test cases in
test_urlparse.py provide a good indicator of parsing behavior.

"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +000030
# Names exported by ``from urlparse import *``.
__all__ = [
    "urlparse", "urlunparse", "urljoin", "urldefrag",
    "urlsplit", "urlunsplit", "parse_qs", "parse_qsl",
]
Skip Montanaro40fc1602001-03-01 04:27:19 +000033
# A classification of schemes ('' means apply by default)

# Schemes for which urljoin() resolves a reference against the base URL.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', '', 'sftp',
    'svn', 'svn+ssh',
]

# Schemes consulted by urlunsplit() and urljoin() when deciding whether a
# '//netloc' part belongs in the URL.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
    'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
]

# Schemes for which urlparse() splits ';params' off the path.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    'mms', '', 'sftp',
]

# Characters valid in scheme names
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz' +
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
    '0123456789' +
    '+-.'
)
Guido van Rossum23cb2a81994-09-12 10:36:35 +000052
# Once the cache holds this many entries, urlsplit() empties it before
# storing another result.
MAX_CACHE_SIZE = 20
# Memoized urlsplit() results, keyed on the call arguments and their types.
_parse_cache = {}


def clear_cache():
    """Discard every memoized urlsplit() result."""
    _parse_cache.clear()
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000059
60
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    The host class must provide a ``netloc`` attribute holding the
    ``[user[:password]@]host[:port]`` part of a URL.  Each property below
    re-derives its value from that string on every access.
    """

    @property
    def username(self):
        """User name from the userinfo part, or None if netloc has no '@'."""
        netloc = self.netloc
        if "@" in netloc:
            # The last '@' is the one that separates userinfo from the host.
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Password from the userinfo part, or None if none is present."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                # Everything after the first ':' belongs to the password.
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host name, or None when netloc carries no host.

        For a bracketed IPv6 literal the text between '[' and ']' is
        returned without the brackets.
        """
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Port number as an int, or None.

        None is returned when there is no port, when the port is empty
        (e.g. ``host:``) or when it lies outside 0..65535.  A non-empty
        port that is not a decimal integer raises ValueError from int().
        """
        # Strip userinfo, then anything up to the ']' closing an IPv6
        # literal, so that only an optional ':port' suffix can remain.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            # An empty port is legal ("http://host:/"); int('') would raise.
            if port:
                port = int(port, 10)
                # verify legal port
                if 0 <= port <= 65535:
                    return port
        return None
Fred Drakead5177c2006-04-01 22:14:43 +0000105
Raymond Hettinger0f6a6562008-01-11 18:04:55 +0000106from collections import namedtuple
Fred Drakead5177c2006-04-01 22:14:43 +0000107
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """Named 5-tuple (scheme, netloc, path, query, fragment) built by urlsplit()."""

    # This class itself declares no extra per-instance attributes.
    __slots__ = ()

    def geturl(self):
        """Reassemble the five components into a URL string via urlunsplit()."""
        return urlunsplit(self)
114
115
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query', 'fragment']),
        ResultMixin):
    """Named 6-tuple (scheme, netloc, path, params, query, fragment) built by urlparse()."""

    # This class itself declares no extra per-instance attributes.
    __slots__ = ()

    def geturl(self):
        """Reassemble the six components into a URL string via urlunparse()."""
        return urlunparse(self)
122
123
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    url, scheme and allow_fragments are handed unchanged to urlsplit(),
    so any exception urlsplit() raises propagates to the caller.
    """
    # Unpack directly; the original bound the result to a local named
    # ``tuple``, which shadowed the builtin.
    scheme, netloc, url, query, fragment = urlsplit(url, scheme,
                                                    allow_fragments)
    params = ''
    # Params are only split off for schemes listed in uses_params.
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    return ParseResult(scheme, netloc, url, params, query, fragment)
Fred Drake5751a222001-11-16 02:52:57 +0000137
def _splitparams(url):
    """Split a path into (path, params) at a ';' delimiter.

    With a '/' in the path, only a ';' at or after the last '/' counts,
    and (url, '') is returned if there is none.  Without a '/', the
    split happens at the first ';'.
    """
    if '/' not in url:
        cut = url.find(';')
    else:
        cut = url.find(';', url.rfind('/'))
        if cut < 0:
            return url, ''
    return url[:cut], url[cut + 1:]
146
def _splitnetloc(url, start=0):
    """Return (netloc, rest) for the text of url beginning at index start.

    The netloc ends at the earliest '/', '?' or '#' found at or after
    start, or at the end of url if none of them occurs; rest begins with
    that delimiter.
    """
    found = [pos for pos in (url.find(ch, start) for ch in '/?#') if pos >= 0]
    end = min(found) if found else len(url)
    return url[start:end], url[end:]
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000154
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Results are memoized in the module-level parse cache.  Raises
    ValueError("Invalid IPv6 URL") when the netloc contains only one of
    '[' and ']'.
    """
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key -- presumably so that inputs
    # of different string types that compare equal are cached separately.
    cache_key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(cache_key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        head, rest = url[:colon], url[colon + 1:]
        if head == 'http':
            # Common case: accepted without the scheme-character and
            # port-number checks applied to other prefixes.
            scheme, url = head.lower(), rest
        elif all(ch in scheme_chars for ch in head):
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            if not rest or any(ch not in '0123456789' for ch in rest):
                # not a port number
                scheme, url = head.lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Brackets must come as a pair.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000209
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    data is any six-item iterable:
    (scheme, netloc, path, params, query, fragment).
    """
    scheme, netloc, path, params, query, fragment = data
    # Non-empty params are re-attached to the path behind a ';'.
    path_with_params = "%s;%s" % (path, params) if params else path
    return urlunsplit((scheme, netloc, path_with_params, query, fragment))
219
def urlunsplit(data):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment = data
    # Emit '//netloc' when there is a netloc, or when the scheme is one
    # listed in uses_netloc and the path does not already begin with '//'.
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        if url and url[:1] != '/':
            # A non-empty path following a netloc has to start with '/'.
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url += '?' + query
    if fragment:
        url += '#' + fragment
    return url
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000237
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty base returns url unchanged and an empty url returns base.
    url is also returned unchanged when its scheme differs from the
    base's or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, _ = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own netloc: nothing is taken from base.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not (path or params):
        # Reference has no path or params: reuse the base's, and the base
        # query too unless the reference supplies one.
        query = query or bquery
        return urlunparse((scheme, netloc, bpath,
                           bparams, query, fragment))

    # Relative path: replace the last segment of the base path with the
    # reference's segments, then collapse '.' and '..'.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    while True:
        # Remove the first interior '<name>/..' pair, then rescan from the
        # start; stop once a full pass finds nothing to remove.
        last = len(segments) - 1
        for idx in range(1, last):
            if (segments[idx] == '..'
                    and segments[idx - 1] not in ('', '..')):
                del segments[idx - 1:idx + 1]
                break
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # A trailing '..' consumes the segment before it.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000289
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' not in url:
        # No fragment: the URL is returned as given, without re-parsing.
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000303
# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin).  If you update this function,
# update it also in urllib.  This code duplication does not exist in Python 3.
Senthil Kumaranf3e9b2a2010-03-18 12:14:15 +0000308
# Every character accepted as a hex digit, in both letter cases.
_hexdig = '0123456789' 'ABCDEF' 'abcdef'
# Lookup table from each two-hex-digit string to the character it encodes.
_hextochr = dict(
    (hi + lo, chr(int(hi + lo, 16)))
    for hi in _hexdig
    for lo in _hexdig
)
Facundo Batistac585df92008-09-03 22:35:50 +0000312
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Each '%' followed by two hex digits is replaced by the character
    from _hextochr; a '%' not followed by two hex digits is kept as is.
    """
    pieces = s.split('%')
    # fastpath
    if len(pieces) == 1:
        return s
    s = pieces[0]
    for chunk in pieces[1:]:
        hexpair, tail = chunk[:2], chunk[2:]
        try:
            s += _hextochr[hexpair] + tail
        except KeyError:
            # Not a valid escape: put the '%' back untouched.
            s += '%' + chunk
        except UnicodeDecodeError:
            # Concatenating the decoded character failed; retry with
            # its unicode equivalent.
            s += unichr(int(hexpair, 16)) + tail
    return s
Facundo Batistac585df92008-09-03 22:35:50 +0000328
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        Returns a dictionary mapping each name to the list of its values,
        in the order parse_qsl() yields them.
    """
    # Named ``result`` rather than ``dict`` so the builtin is not shadowed.
    result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        result.setdefault(name, []).append(value)
    return result
354
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples in the order the fields
    appear in qs.
    """
    # Both '&' and ';' separate fields.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                # Call form of raise: valid on both Python 2 and Python 3.
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            # '+' encodes a space; translate it before percent-decoding.
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
393 return r