blob: 294545c8c090d368f277e35fa63d0b7724393011 [file] [log] [blame]
"""Parse (absolute and relative) URLs.

urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.

"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +000027
# Names exported by ``from urlparse import *``.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
Skip Montanaro40fc1602001-03-01 04:27:19 +000030
# A classification of schemes ('' means apply by default).
# These are plain mutable lists; the functions in this module test
# membership with ``scheme in <list>``.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']
Guido van Rossum23cb2a81994-09-12 10:36:35 +000049
# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
Guido van Rossum23cb2a81994-09-12 10:36:35 +000055
# Maximum number of urlsplit() results kept before the cache is emptied.
MAX_CACHE_SIZE = 20
# Memo table filled by urlsplit(); keys combine the arguments and their types.
_parse_cache = {}
58
def clear_cache():
    """Clear the parse cache."""
    # Empty the dict in place so every reference to it sees the change.
    _parse_cache.clear()
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000062
63
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Every property is derived from ``self.netloc``, which the concrete
    result class must provide.  The netloc is read as
    ``[user[:password]@]host[:port]``, where the host may be a bracketed
    IPv6 literal such as ``[::1]``.
    """

    def _userinfo(self):
        """Return the text before the last '@' in netloc, or None if no '@'."""
        netloc = self.netloc
        if "@" in netloc:
            return netloc.rsplit("@", 1)[0]
        return None

    def _hostinfo(self):
        """Return netloc with any leading ``userinfo@`` part removed."""
        netloc = self.netloc
        if "@" in netloc:
            netloc = netloc.rsplit("@", 1)[1]
        return netloc

    @property
    def username(self):
        """User name from the netloc, or None when there is no '@'."""
        userinfo = self._userinfo()
        if userinfo is not None and ":" in userinfo:
            userinfo = userinfo.split(":", 1)[0]
        return userinfo

    @property
    def password(self):
        """Password from the netloc, or None when none is given."""
        userinfo = self._userinfo()
        if userinfo is not None and ":" in userinfo:
            return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host name without port, or None when empty.

        For a bracketed IPv6 literal the brackets are stripped, so
        ``[FE80::1]:80`` yields ``fe80::1``.
        """
        hostinfo = self._hostinfo()
        if hostinfo[:1] == "[" and "]" in hostinfo:
            # The colons inside the brackets belong to the address itself.
            host = hostinfo[1:hostinfo.index("]")]
        else:
            host = hostinfo.split(":", 1)[0]
        return host.lower() or None

    @property
    def port(self):
        """Port as an int, or None when absent or empty.

        Raises ValueError if the port text is not a decimal number.
        """
        hostinfo = self._hostinfo()
        if hostinfo[:1] == "[" and "]" in hostinfo:
            # Skip the IPv6 literal so its colons are not taken for the
            # port delimiter.
            hostinfo = hostinfo[hostinfo.index("]") + 1:]
        if ":" in hostinfo:
            port = hostinfo.split(":", 1)[1]
            if port:  # "host:" has a delimiter but no port
                return int(port, 10)
        return None
104
Raymond Hettinger0f6a6562008-01-11 18:04:55 +0000105from collections import namedtuple
Fred Drakead5177c2006-04-01 22:14:43 +0000106
class SplitResult(namedtuple('SplitResult',
                             'scheme netloc path query fragment'),
                  ResultMixin):
    """5-tuple (scheme, netloc, path, query, fragment) with the
    ResultMixin attributes added."""

    # No extra slots are declared by this subclass.
    __slots__ = ()

    def geturl(self):
        """Return the URL string rebuilt from this tuple by urlunsplit()."""
        return urlunsplit(self)
113
114
class ParseResult(namedtuple('ParseResult',
                             'scheme netloc path params query fragment'),
                  ResultMixin):
    """6-tuple (scheme, netloc, path, params, query, fragment) with the
    ResultMixin attributes added."""

    # No extra slots are declared by this subclass.
    __slots__ = ()

    def geturl(self):
        """Return the URL string rebuilt from this tuple by urlunparse()."""
        return urlunparse(self)
121
122
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    # urlsplit() does everything except separating params from the path.
    scheme, netloc, url, query, fragment = urlsplit(url, scheme,
                                                    allow_fragments)
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)
Fred Drake5751a222001-11-16 02:52:57 +0000136
137def _splitparams(url):
138 if '/' in url:
139 i = url.find(';', url.rfind('/'))
140 if i < 0:
141 return url, ''
142 else:
143 i = url.find(';')
144 return url[:i], url[i+1:]
145
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000146def _splitnetloc(url, start=0):
Guido van Rossumc6a04c22008-01-05 22:19:06 +0000147 delim = len(url) # position of end of domain part of url, default is end
148 for c in '/?#': # look for delimiters; the order is NOT important
149 wdelim = url.find(c, start) # find first of this delim
150 if wdelim >= 0: # if found
151 delim = min(delim, wdelim) # use earliest delim position
152 return url[start:delim], url[delim:] # return (domain, rest)
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000153
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key so that equal-comparing values
    # of different string types get separate cache entries.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached is not None:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http':  # optimize the common case
            # This fast path does not consult uses_query / uses_fragment.
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        # Accept the text before ':' as a scheme only if every character
        # is a legal scheme character; otherwise keep the default scheme.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000197
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    data is any six-item iterable:
    (scheme, netloc, path, params, query, fragment).
    """
    scheme, netloc, url, params, query, fragment = data
    if params:
        # Fold the params back onto the path, then let urlunsplit() finish.
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))
207
def urlunsplit(data):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment = data
    # Emit the '//' authority marker when there is a netloc, or when the
    # scheme is one that takes a netloc and the path does not already
    # begin with '//'.
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000225
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.
    url is also returned unchanged when its scheme differs from the
    base's scheme or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference carries its own authority: nothing to inherit.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # An absolute path replaces the base path entirely.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        if params:
            # A params-only reference such as ';x' replaces the last
            # segment of the base path, so keep the base path up to and
            # including its final '/'.
            path = bpath[:bpath.rfind('/') + 1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Merge: drop the base's last segment, then append the reference path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly cancel each 'name/..' pair, restarting the scan after
    # every removal, until a full pass finds nothing to remove.
    while True:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                    and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000282
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' not in url:
        # Nothing to strip: hand the input back untouched.
        return url, ''
    s, n, p, a, q, frag = urlparse(url)
    defrag = urlunparse((s, n, p, a, q, ''))
    return defrag, frag
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000296
# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference:
# urllib uses urlparse functions (urljoin).

_hexdig = '0123456789ABCDEFabcdef'
# Maps every two-hex-digit string (in any letter case) to its character.
_hextochr = dict((a+b, chr(int(a+b, 16))) for a in _hexdig for b in _hexdig)

try:
    _unichr = unichr        # Python 2
except NameError:
    _unichr = chr           # no separate unichr(): chr() is used instead


def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Each '%XX' escape is replaced by the character with that hex code.
    A '%' that is not followed by two hex digits is left as it is.
    """
    res = s.split('%')
    # res[0] is the text before the first '%'; every later item starts
    # with what should be the two hex digits of an escape.
    for i in range(1, len(res)):
        item = res[i]
        try:
            res[i] = _hextochr[item[:2]] + item[2:]
        except KeyError:
            # Not a valid escape: put the '%' back untouched.
            res[i] = '%' + item
        except UnicodeDecodeError:
            # The table character could not be joined to the rest of the
            # item; build the character with _unichr instead.
            res[i] = _unichr(int(item[:2], 16)) + item[2:]
    return "".join(res)
317
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

        Arguments:

        qs: URL-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            URL encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        Returns a dictionary mapping each name to the list of its values,
        in the order they appear in qs.
    """
    result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        # Repeated names accumulate their values in one list.
        result.setdefault(name, []).append(value)
    return result
343
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) pairs in the order they appear in qs.
    """
    # Fields may be separated by either '&' or ';'.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            # '+' stands for a space and is translated before the
            # %-escapes are decoded.
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
382 return r