blob: 4cd3d6743a960f9e88f1de18ded659f6a84cc04d [file] [log] [blame]
"""Parse (absolute and relative) URLs.

urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The test cases in
test_urlparse.py provide a good indicator of parsing behavior.

"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +000030
Serhiy Storchaka923baea2013-03-14 21:31:09 +020031import re
32
# Names exported by a star-import of this module.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
Skip Montanaro40fc1602001-03-01 04:27:19 +000035
# A classification of schemes ('' means apply by default)

# urljoin() only resolves a reference against its base when the scheme
# appears here; otherwise the reference is returned untouched.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh']
# Consulted by urlunsplit() and urljoin() when deciding whether a
# "//netloc" part applies to the scheme.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
# urlparse() splits ";params" off the path only for these schemes.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']
Guido van Rossum23cb2a81994-09-12 10:36:35 +000048
# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']
58
# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
Guido van Rossum23cb2a81994-09-12 10:36:35 +000064
# urlsplit() memoises its results in _parse_cache and empties the whole
# cache once it holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}


def clear_cache():
    """Clear the parse cache."""
    # Clear in place so every reference to the dict sees the reset.
    _parse_cache.clear()
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000071
72
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    The class this is mixed into must expose a ``netloc`` attribute;
    each property below is recomputed from it on every access.
    """

    @property
    def username(self):
        """User name part of ``netloc``, or None when there is no '@'."""
        netloc = self.netloc
        if "@" in netloc:
            # Split on the last '@' so that the userinfo part may itself
            # contain '@' characters.
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Password part of ``netloc``, or None when none is present."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                # Only the first ':' separates user from password.
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host name, or None when ``netloc`` has no host.

        A bracketed IPv6 literal is returned without its brackets.
        """
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            # "[addr]:port" -> "addr"
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Port as an int, or None if absent, empty or out of range.

        Raises ValueError when the port text is not a base-10 integer.
        """
        # Drop userinfo and any bracketed IPv6 literal so that only a
        # trailing ":port" can still contain a colon.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            if port:
                port = int(port, 10)
                # verify legal port
                if 0 <= port <= 65535:
                    return port
        return None
Fred Drakead5177c2006-04-01 22:14:43 +0000118
Raymond Hettinger0f6a6562008-01-11 18:04:55 +0000119from collections import namedtuple
Fred Drakead5177c2006-04-01 22:14:43 +0000120
class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
    """5-tuple of URL components, with the ResultMixin properties."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunsplit()."""
        return urlunsplit(self)
127
128
class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
    """6-tuple of URL components, with the ResultMixin properties."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunparse()."""
        return urlunparse(self)
135
136
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    The arguments are passed straight to urlsplit(); params are then
    split off the path only when the scheme is listed in uses_params
    and the path contains a ';'."""
    # Local name chosen so it does not shadow the builtin ``tuple``.
    splitresult = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = splitresult
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)
Fred Drake5751a222001-11-16 02:52:57 +0000150
def _splitparams(url):
    """Split a path into ``(path, params)`` at the relevant ';'.

    When the path contains a '/', only a ';' at or after the last '/'
    counts, so params are taken from the final segment; otherwise the
    first ';' is used.  If no such ';' exists the path is returned
    unchanged together with empty params.
    """
    if '/' in url:
        i = url.find(';', url.rfind('/'))
    else:
        i = url.find(';')
    if i < 0:
        # Without this guard the no-'/' case sliced with i == -1 and
        # returned the path minus its last character.
        return url, ''
    return url[:i], url[i+1:]
159
def _splitnetloc(url, start=0):
    """Return ``(netloc, rest)`` for ``url``, scanning from ``start``.

    The netloc runs from ``start`` up to the first '/', '?' or '#'
    found at or after ``start`` (or to the end of the string);
    ``rest`` begins with that delimiter.
    """
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000167
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    ``scheme`` is the default used when the URL carries none.  When
    ``allow_fragments`` is false, '#' is not treated as a fragment
    delimiter.  Raises ValueError("Invalid IPv6 URL") if the netloc
    contains only one of '[' and ']'.  Results are memoised in
    _parse_cache, which is emptied once it reaches MAX_CACHE_SIZE."""
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key, so equal-comparing values
    # of different string types get separate cache entries.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http':  # optimize the common case
            # No character validation or port-number check is needed.
            scheme = url[:i].lower()
            url = url[i+1:]
        else:
            for c in url[:i]:
                if c not in scheme_chars:
                    break
            else:
                # make sure "url" is not actually a port number (in which
                # case "scheme" is really part of the path)
                rest = url[i+1:]
                if not rest or any(c not in '0123456789' for c in rest):
                    # not a port number
                    scheme, url = url[:i].lower(), rest

    # Common tail, shared by the 'http' fast path and the general path.
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000222
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    ``data`` is unpacked into six items:
    (scheme, netloc, path, params, query, fragment)."""
    scheme, netloc, url, params, query, fragment = data
    if params:
        # Re-attach the params to the path before delegating.
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))
232
def urlunsplit(data):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment = data
    # Emit "//netloc" when there is a netloc, or when the scheme is one
    # listed in uses_netloc and the path does not already start with "//".
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000250
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty ``base`` returns ``url`` and an empty ``url`` returns
    ``base``.  ``url`` is also returned unchanged when its scheme
    differs from the base scheme or is not listed in uses_relative."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default for a reference that has none.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own host: nothing comes from base.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: keep it, on the base's netloc.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path and not params:
        # Query- or fragment-only reference: reuse the base path/params,
        # and the base query too when the reference has none.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: replace the last segment of the base path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse an interior "<name>/.." pair until none is left.
    while True:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                    and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000302
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        # Rebuild the URL with the fragment slot emptied.
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000316
# Define _is_unicode() according to whether the ``unicode`` builtin exists.
try:
    unicode
except NameError:
    def _is_unicode(x):
        """Return False: this interpreter has no ``unicode`` type."""
        return False
else:
    def _is_unicode(x):
        """Return True if ``x`` is an instance of ``unicode``."""
        return isinstance(x, unicode)
325
# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin).  If you update this function,
# update it also in urllib.  This code duplication does not exist in Python3.
Senthil Kumaranf3e9b2a2010-03-18 12:14:15 +0000330
# Lookup tables used by unquote().
_hexdig = '0123456789ABCDEFabcdef'
# Maps every two-hex-digit string, in any mix of case, to its character.
_hextochr = dict((a+b, chr(int(a+b, 16)))
                 for a in _hexdig for b in _hexdig)
# Matches runs of ASCII characters; the group makes split() keep them.
_asciire = re.compile('([\x00-\x7f]+)')
Facundo Batistac585df92008-09-03 22:35:50 +0000335
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    A '%' that is not followed by two hex digits is left in place.
    """
    if _is_unicode(s):
        if '%' not in s:
            return s
        # Odd-indexed pieces are the ASCII runs captured by _asciire;
        # only those are unquoted, the pieces in between pass through.
        bits = _asciire.split(s)
        res = [bits[0]]
        append = res.append
        for i in range(1, len(bits), 2):
            append(unquote(str(bits[i])).decode('latin1'))
            append(bits[i + 1])
        return ''.join(res)

    bits = s.split('%')
    # fastpath
    if len(bits) == 1:
        return s
    res = [bits[0]]
    append = res.append
    for item in bits[1:]:
        try:
            append(_hextochr[item[:2]])
            append(item[2:])
        except KeyError:
            # Not a valid escape: put the '%' back and keep the text.
            append('%')
            append(item)
    return ''.join(res)
Facundo Batistac585df92008-09-03 22:35:50 +0000363
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        Returns a dictionary mapping each name to the list of its
        values, in the order parse_qsl() produced them.
    """
    # Local name chosen so it does not shadow the builtin ``dict``.
    result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        result.setdefault(name, []).append(value)
    return result
389
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples.
    """
    # Fields may be separated by either '&' or ';'.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                # Call form of raise: valid on both Python 2 and 3.
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            # '+' encodes a space; convert it before percent-unquoting.
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
428 return r