blob: 4ce982e8fdd4e52e403a41520ec566f17f0d1f5a [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaran420ec8a2010-04-17 14:30:53 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
Andrew M. Kuchlingba88b7f2010-04-30 00:49:09 +000011RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaran420ec8a2010-04-17 14:30:53 +000012Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
Andrew M. Kuchlingba88b7f2010-04-30 00:49:09 +000014RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
Senthil Kumaran420ec8a2010-04-17 14:30:53 +000015
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
Andrew M. Kuchlingba88b7f2010-04-30 00:49:09 +000019RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaran420ec8a2010-04-17 14:30:53 +000020McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
28
Guido van Rossume7b146f2000-02-04 15:28:42 +000029"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +000030
Serhiy Storchaka923baea2013-03-14 21:31:09 +020031import re
32
# Names exported by "from urlparse import *".
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]

# A classification of schemes ('' means apply by default)

# Schemes for which urljoin() resolves a relative reference against the
# base; for any other scheme urljoin() returns the second URL unchanged.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh']
# Schemes for which urlunsplit() emits a '//' authority marker even when
# the netloc is empty, and for which urljoin() inherits the base netloc.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp','nfs','git', 'git+ssh']
# Schemes for which urlparse() splits ';params' off the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
# (urlsplit() only accepts a "scheme:" prefix made solely of these.)
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
Guido van Rossum23cb2a81994-09-12 10:36:35 +000064
# Upper bound on the number of memoized urlsplit() results; once the
# cache reaches this size it is emptied wholesale rather than trimmed.
MAX_CACHE_SIZE = 20
# Maps (url, scheme, allow_fragments, type(url), type(scheme)) keys to
# the SplitResult previously computed for them.
_parse_cache = {}


def clear_cache():
    """Discard every memoized urlsplit() result.

    The module-level cache dictionary is emptied in place, so existing
    references to it keep pointing at the same (now empty) object.
    """
    _parse_cache.clear()
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000071
72
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    The host class must expose a ``netloc`` string attribute of the form
    ``[user[:password]@]host[:port]``.  Every property below is derived
    from that attribute on each access; nothing is cached.
    """

    @property
    def username(self):
        """User name from the userinfo part of netloc, or None if the
        netloc contains no '@'."""
        netloc = self.netloc
        if "@" in netloc:
            # Split on the LAST '@' so that an '@' inside the userinfo
            # does not truncate it.
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Password from the userinfo part of netloc, or None if there is
        no '@' or no ':' inside the userinfo."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                # Everything after the first ':' belongs to the password.
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host part of netloc, or None for an empty netloc.

        For a bracketed IPv6 literal the text between '[' and ']' is
        returned without the brackets.
        """
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Port number as an int, or None.

        None is returned when the netloc has no ':port' suffix, when the
        port is empty (e.g. "host:"), or when the number falls outside
        0..65535.  A non-numeric port raises ValueError from int().
        """
        # Drop the userinfo and any bracketed IPv6 literal so that the
        # only ':' left is the one introducing the port.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            # An empty port ("host:") is legal per RFC 3986 and means
            # "no port given"; int('') would raise ValueError instead.
            if port:
                port = int(port, 10)
                # verify legal port
                if 0 <= port <= 65535:
                    return port
        return None
Fred Drakead5177c2006-04-01 22:14:43 +0000117
Raymond Hettinger0f6a6562008-01-11 18:04:55 +0000118from collections import namedtuple
Fred Drakead5177c2006-04-01 22:14:43 +0000119
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """Five-field result tuple: (scheme, netloc, path, query, fragment).

    Adds the netloc-derived properties of ResultMixin on top of the plain
    named tuple.
    """

    # No per-instance __dict__: instances stay as light as bare tuples.
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL string via urlunsplit()."""
        return urlunsplit(self)
126
127
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """Six-field result tuple:
    (scheme, netloc, path, params, query, fragment).

    Adds the netloc-derived properties of ResultMixin on top of the plain
    named tuple.
    """

    # No per-instance __dict__: instances stay as light as bare tuples.
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL string via urlunparse()."""
        return urlunparse(self)
134
135
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a ParseResult, i.e. a 6-tuple
    (scheme, netloc, path, params, query, fragment).

    The heavy lifting is delegated to urlsplit(); this function only
    separates the ';params' suffix from the path, and only for schemes
    listed in uses_params.  Components are not broken up any further
    (e.g. netloc stays a single string) and % escapes are not expanded.
    """
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
Fred Drake5751a222001-11-16 02:52:57 +0000149
def _splitparams(url):
    """Split a path into (path, params) at the relevant ';'.

    When the path contains a '/', only a ';' at or after the last '/' is
    considered, so parameters attached to earlier segments stay in the
    path; if there is none, (url, '') is returned.  Without any '/', the
    split happens at the first ';'.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semicolon = url.find(';', last_slash)
        if semicolon < 0:
            return url, ''
    else:
        semicolon = url.find(';')
    return url[:semicolon], url[semicolon + 1:]
158
def _splitnetloc(url, start=0):
    """Return (netloc, rest) for the authority that begins at `start`.

    The netloc runs from `start` up to the earliest '/', '?' or '#' found
    at or after `start`, or to the end of the string when none occurs.
    `rest` begins with that delimiter (or is empty).
    """
    found = [pos
             for pos in (url.find(ch, start) for ch in '/?#')
             if pos >= 0]
    # With no delimiter at all, the netloc extends to the end of the URL.
    end = min(found) if found else len(url)
    return url[start:end], url[end:]
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000166
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Return a SplitResult, i.e. a 5-tuple
    (scheme, netloc, path, query, fragment).

    `scheme` is the default used when the URL carries none.  When
    `allow_fragments` is false, a '#...' suffix is left in place instead
    of being split off as the fragment.  Components are not broken up any
    further (e.g. netloc stays a single string) and % escapes are not
    expanded.

    Raises ValueError if the netloc contains a '[' without a matching ']'
    or vice versa.  Results are memoized in the module-level parse cache.
    """
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key so that equal-comparing
    # inputs of different string types do not share a cached result.
    cache_key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(cache_key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        candidate, remainder = url[:colon], url[colon + 1:]
        if candidate == 'http':
            # Common case: accepted straight away, without the
            # character and port-number checks below.
            scheme, url = candidate.lower(), remainder
        elif all(ch in scheme_chars for ch in candidate):
            # Make sure "url" is not actually a port number (in which
            # case "scheme" is really part of the path).
            if not remainder or any(ch not in '0123456789'
                                    for ch in remainder):
                scheme, url = candidate.lower(), remainder

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Brackets must come as a pair; exactly one of them is an error.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000221
def urlunparse(data):
    """Rebuild a URL string from a 6-item iterable
    (scheme, netloc, path, params, query, fragment).

    Non-empty params are re-attached to the path with ';' and the rest is
    delegated to urlunsplit().  The result may differ slightly from the
    URL that was originally parsed while remaining equivalent, e.g. when
    the original had redundant delimiters such as a '?' with an empty
    query.
    """
    scheme, netloc, path, params, query, fragment = data
    path_with_params = "%s;%s" % (path, params) if params else path
    return urlunsplit((scheme, netloc, path_with_params, query, fragment))
231
def urlunsplit(data):
    """Combine a 5-item iterable (scheme, netloc, path, query, fragment),
    such as the value returned by urlsplit(), into a URL string.

    The result may differ slightly from the URL that was originally
    parsed while remaining equivalent, e.g. when the original had
    unnecessary delimiters such as a '?' with an empty query.
    """
    scheme, netloc, path, query, fragment = data

    # An authority section is written when there is a netloc, or when the
    # scheme is one that takes a netloc and the path does not already
    # begin with '//'.
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and path[:2] != '//')
    if wants_authority:
        if path and path[:1] != '/':
            # A relative path must be separated from the netloc.
            path = '/' + path
        path = '//' + (netloc or '') + path

    result = path
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000249
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned as is.  `url`
    is also returned unchanged when its scheme differs from the base's or
    is not listed in uses_relative.  `allow_fragments` is passed through
    to urlparse() for both arguments.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default for the reference, so a scheme-less
    # reference compares equal to the base in the test below.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own authority: nothing but the
            # scheme is inherited from the base.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: keep it verbatim.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path and not params:
        # Reference consists of a query and/or fragment only: reuse the
        # base path and params, and the base query if none was given.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: drop the last segment of the base path and append
    # the segments of the reference.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' becomes an empty segment (keeps the trailing slash);
    # all remaining '.' segments are removed.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly remove the first "<name>/.." pair, rescanning from the
    # start after each removal.  The scan covers indices 1..len-2, so a
    # '..' in the last position is left for the checks after the loop,
    # and a '..' preceded by '' or '..' is left in place.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            # A full scan found nothing left to collapse.
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # A trailing '..' replaces itself and the preceding segment with
        # a single empty segment.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000301
def urldefrag(url):
    """Strip the fragment from a URL.

    Returns a (defragmented_url, fragment) tuple.  A URL without any '#'
    is returned untouched together with an empty string; otherwise the
    URL is parsed and reassembled without its fragment.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000315
# Feature-test for the `unicode` type: when the name does not exist,
# nothing can be a unicode string and the predicate is constant.
try:
    unicode
except NameError:
    def _is_unicode(x):
        """Always false: this interpreter has no `unicode` type."""
        return 0
else:
    def _is_unicode(x):
        """Tell whether x is an instance of `unicode`."""
        return isinstance(x, unicode)

# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin).  If you update this function,
# update it also in urllib.  This code duplication does not exist in Python3.

_hexdig = '0123456789ABCDEFabcdef'
# Lookup table from every two-character hex pair (any mix of upper and
# lower case) to the character with that ordinal.
_hextochr = dict((hi + lo, chr(int(hi + lo, 16)))
                 for hi in _hexdig for lo in _hexdig)
# Captures maximal runs of ASCII characters; used to isolate the parts of
# a unicode string that may contain %-escapes.
_asciire = re.compile('([\x00-\x7f]+)')


def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Each '%XX' escape with two hex digits is replaced by the character it
    encodes; a '%' not followed by two hex digits is kept literally.
    """
    if _is_unicode(s):
        if '%' not in s:
            return s
        # Splitting on a capturing pattern alternates between non-ASCII
        # text (even indices) and ASCII runs (odd indices).  Only the
        # ASCII runs are unquoted, as byte strings, and then decoded as
        # latin1; the non-ASCII text is copied through.
        parts = _asciire.split(s)
        out = [parts[0]]
        for ascii_run, other in zip(parts[1::2], parts[2::2]):
            out.append(unquote(str(ascii_run)).decode('latin1'))
            out.append(other)
        return ''.join(out)

    chunks = s.split('%')
    # fastpath
    if len(chunks) == 1:
        return s
    out = [chunks[0]]
    for chunk in chunks[1:]:
        decoded = _hextochr.get(chunk[:2])
        if decoded is None:
            # Not a valid escape: put the '%' back and keep the text.
            out.append('%')
            out.append(chunk)
        else:
            out.append(decoded)
            out.append(chunk[2:])
    return ''.join(out)
Facundo Batistac585df92008-09-03 22:35:50 +0000362
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a dictionary of value lists.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dict mapping each field name to the list of its values, in
    the order in which parse_qsl() produced them.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
388
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples in the order the fields appear
    in qs.  In both name and value, '+' is turned into a space before
    percent-escapes are decoded.

    Raises ValueError if strict_parsing is true and a field has no '='.
    """
    # Fields may be separated by either '&' or ';'.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        # Empty fields (e.g. from "a=1&&b=2") are skipped unless strict.
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                # Call form of raise: valid on both Python 2 and 3.
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
427 return r