blob: c5135c1f23545262686584519db60b6ec05d382f [file] [log] [blame]
"""Parse (absolute and relative) URLs.

The urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to the
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The test cases in
test_urlparse.py provide a good indicator of parsing behavior.

"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +000027
# Public names exported by a star-import of this module.
__all__ = [
    "urlparse", "urlunparse", "urljoin", "urldefrag",
    "urlsplit", "urlunsplit", "parse_qs", "parse_qsl",
]
Skip Montanaro40fc1602001-03-01 04:27:19 +000030
# A classification of schemes.  The empty string '' stands for "no scheme",
# i.e. it marks the behaviour that applies by default.

# Schemes for which urljoin() resolves a relative reference against a base.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '', 'sftp',
]

# Schemes that carry a '//netloc' part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu',
    'rsync', '', 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
]

non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap',
    'snews', 'sip', 'sips',
]

# Schemes for which urlparse() splits ';params' off the path.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms', '', 'sftp',
]

# Schemes for which urlsplit() splits off a '?query'.
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms', 'gopher', 'rtsp',
    'rtspu', 'sip', 'sips', '',
]

# Schemes for which urlsplit() splits off a '#fragment'.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', 'https',
    'shttp', 'snews', 'file', 'prospero', '',
]

# Characters valid in scheme names
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789'
    '+-.'
)

# urlsplit() results are memoised in _parse_cache; once the cache holds
# MAX_CACHE_SIZE entries it is emptied before the next insert.
MAX_CACHE_SIZE = 20
_parse_cache = {}
58
def clear_cache():
    """Empty the module-level parse cache in place."""
    _parse_cache.clear()
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000062
63
class ResultMixin(object):
    """Shared read-only accessors for the parsed result objects.

    Every property derives its value from ``self.netloc``, which is read as
    ``[userinfo@]host[:port]``.  The host may be a bracketed IPv6 literal
    such as ``[::1]``; colons inside the brackets are not taken for the
    port separator.
    """

    @property
    def username(self):
        """User name from the userinfo part, or None without an '@'."""
        userinfo, at_sign, _ = self.netloc.rpartition("@")
        if not at_sign:
            return None
        # Everything before the first ':' is the name; the rest is password.
        return userinfo.split(":", 1)[0]

    @property
    def password(self):
        """Password from the userinfo part, or None if there is none."""
        userinfo, at_sign, _ = self.netloc.rpartition("@")
        if not at_sign or ":" not in userinfo:
            return None
        return userinfo.split(":", 1)[1]

    @property
    def hostname(self):
        """Lower-cased host without userinfo and port, or None if empty.

        For a bracketed IPv6 literal the brackets are stripped.
        """
        hostinfo = self.netloc.rpartition("@")[2]
        if hostinfo[:1] == "[" and "]" in hostinfo:
            # IPv6 literal: the host is whatever sits between the brackets.
            host = hostinfo[1:hostinfo.index("]")]
        else:
            host = hostinfo.split(":", 1)[0]
        return host.lower() or None

    @property
    def port(self):
        """Port as an int, or None when absent or empty.

        Raises ValueError if the text after the ':' is not a decimal number.
        """
        hostinfo = self.netloc.rpartition("@")[2]
        if hostinfo[:1] == "[" and "]" in hostinfo:
            # Skip the bracketed IPv6 literal so its colons are not
            # mistaken for the port separator.
            hostinfo = hostinfo[hostinfo.index("]") + 1:]
        if ":" in hostinfo:
            port = hostinfo.split(":", 1)[1]
            if port:  # "host:" carries no port at all
                return int(port, 10)
        return None
104
Raymond Hettinger0f6a6562008-01-11 18:04:55 +0000105from collections import namedtuple
Fred Drakead5177c2006-04-01 22:14:43 +0000106
class SplitResult(
        namedtuple('SplitResult', 'scheme netloc path query fragment'),
        ResultMixin):
    """Five-field named tuple produced by urlsplit()."""

    # No per-instance __dict__: instances stay as light as a plain tuple.
    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL string via urlunsplit()."""
        return urlunsplit(self)
113
114
class ParseResult(
        namedtuple('ParseResult', 'scheme netloc path params query fragment'),
        ResultMixin):
    """Six-field named tuple produced by urlparse()."""

    # No per-instance __dict__: instances stay as light as a plain tuple.
    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL string via urlunparse()."""
        return urlunparse(self)
121
122
def urlparse(url, scheme='', allow_fragments=True):
    """Split a URL into its six components.

    The general form is
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    and the result is a ParseResult
    (scheme, netloc, path, params, query, fragment).

    The components are not broken down any further (netloc stays one
    string) and % escapes are left untouched.
    """
    scheme, netloc, url, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Parameters are only split off for schemes known to use them.
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    return ParseResult(scheme, netloc, url, params, query, fragment)
Fred Drake5751a222001-11-16 02:52:57 +0000136
def _splitparams(url):
    """Split ';params' off the last path segment of *url*.

    Returns a ``(path, params)`` pair.  Only a ';' located at or after the
    last '/' counts; when there is no '/', the whole string is searched.
    If no such ';' exists the path comes back unchanged with empty params.
    """
    # rfind() yields -1 without a '/'; clamp to 0 so the search covers the
    # whole string instead of starting at the final character.
    i = url.find(';', max(url.rfind('/'), 0))
    if i < 0:
        return url, ''
    return url[:i], url[i+1:]
145
def _splitnetloc(url, start=0):
    """Split *url* into ``(domain, rest)``.

    The domain runs from index *start* up to the earliest '/', '?' or '#'
    found at or after *start*, or to the end of the string if none occurs.
    *rest* begins with that delimiter.
    """
    found = [pos for pos in (url.find(ch, start) for ch in '/?#')
             if pos >= 0]
    # With no delimiter at all the domain extends to the end of the url.
    end = min(found + [len(url)])
    return url[start:end], url[end:]
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000153
def urlsplit(url, scheme='', allow_fragments=True):
    """Split a URL into its five components.

    The general form is
    <scheme>://<netloc>/<path>?<query>#<fragment>
    and the result is a SplitResult (scheme, netloc, path, query, fragment).

    The components are not broken down any further (netloc stays one
    string) and % escapes are left untouched.  Results are memoised in the
    module-level parse cache.
    """
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key, so values that compare equal
    # but differ in type get separate cache entries.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    # A literal 'http' prefix is the common case: it is accepted without
    # scanning its characters and always has query/fragment split off.
    fast_http = False
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        fast_http = prefix == 'http'
        if fast_http or all(ch in scheme_chars for ch in prefix):
            scheme, url = prefix.lower(), url[colon + 1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if (allow_fragments and (fast_http or scheme in uses_fragment)
            and '#' in url):
        url, fragment = url.split('#', 1)
    if (fast_http or scheme in uses_query) and '?' in url:
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000197
def urlunparse(data):
    """Combine a six-item parse result back into a URL string.

    *data* is any iterable of (scheme, netloc, path, params, query,
    fragment).  The output may differ slightly from the URL that was parsed,
    while remaining equivalent, if the original had redundant delimiters
    such as a '?' followed by an empty query.
    """
    scheme, netloc, url, params, query, fragment = data
    # Non-empty params are folded back into the path before delegating.
    path = "%s;%s" % (url, params) if params else url
    return urlunsplit((scheme, netloc, path, query, fragment))
207
def urlunsplit(data):
    """Combine a five-item split result back into a URL string.

    *data* is any iterable of (scheme, netloc, path, query, fragment).
    Empty components are left out together with their delimiter.
    """
    scheme, netloc, url, query, fragment = data
    # A '//' authority marker is written when there is a netloc, or when the
    # scheme is one that takes a netloc and the path does not already start
    # with '//'.
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000220
def urljoin(base, url, allow_fragments=True):
    """Resolve *url*, which may be relative, against the base URL *base*.

    An empty *base* yields *url* and an empty *url* yields *base*.  *url* is
    also returned unchanged when its scheme differs from the base scheme or
    is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base

    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    # The base scheme is the default for a reference that has none.
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own host; nothing is inherited.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))

    if not path:
        path = bpath
        if params:
            # The reference brought its own params: drop the final
            # character of the inherited path and stop inheriting.
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))

    # Relative path: replace the last base segment with the reference.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']

    # Repeatedly cancel the first interior "<name>/.." pair.  The first and
    # last segments are never examined as the '..' half.
    collapsed = True
    while collapsed:
        collapsed = False
        for idx in range(1, len(segments) - 1):
            if (segments[idx] == '..'
                    and segments[idx - 1] not in ('', '..')):
                del segments[idx - 1:idx + 1]
                collapsed = True
                break

    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000277
def urldefrag(url):
    """Strip the fragment, if any, from *url*.

    Returns a ``(defragmented_url, fragment)`` tuple.  When *url* contains
    no '#', it is returned unchanged and the fragment is the empty string.
    """
    if '#' not in url:
        return url, ''
    parsed = urlparse(url)
    # Rebuild from the first five components with the fragment blanked.
    stripped = urlunparse(parsed[:5] + ('',))
    return stripped, parsed[5]
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000291
# unquote function used by parse_qs and parse_qsl.
# It cannot be imported from urllib, because that would create a circular
# import: urllib itself uses urlparse functions (urljoin).
295
Senthil Kumaran34f92772010-03-29 19:30:44 +0000296
_hexdig = '0123456789ABCDEFabcdef'
# Lookup table from every two-hex-digit string (either case) to the
# character with that code.
_hextochr = dict((pair, chr(int(pair, 16)))
                 for pair in (hi + lo for hi in _hexdig for lo in _hexdig))

def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    A '%' that is not followed by two hex digits is kept literally.
    """
    pieces = s.split('%')
    # Text before the first '%' has no escape to decode.
    decoded = pieces[:1]
    for chunk in pieces[1:]:
        code, tail = chunk[:2], chunk[2:]
        try:
            decoded.append(_hextochr[code] + tail)
        except KeyError:
            # Not a valid escape: put the '%' back untouched.
            decoded.append('%' + chunk)
        except UnicodeDecodeError:
            decoded.append(unichr(int(code, 16)) + tail)
    return "".join(decoded)
312
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a dictionary of value lists.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dict mapping each field name to the list of its values, in
    the order they appear in the query.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
338
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Fields are separated by '&' or ';'.  In both name and value, '+' is
    turned into a space before % escapes are decoded.

    Returns a list of (name, value) tuples in query order.
    """
    fields = []
    for chunk in qs.split('&'):
        fields.extend(chunk.split(';'))

    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        name, equals, value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # Handle case of a control-name with no equal sign
            if not keep_blank_values:
                continue
        if value or keep_blank_values:
            name = unquote(name.replace('+', ' '))
            value = unquote(value.replace('+', ' '))
            result.append((name, value))

    return result
378
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000379
# Self-test table read by test(): the first non-blank line is the base URL;
# every following line is "<reference> = <URL:expected join result>".
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000413
def _run_join_checks(lines):
    """Parse and join every URL found in *lines*, printing the results.

    The first word of each non-blank line is a URL.  The first URL seen
    becomes the base for all later joins.  A line of the form
    "<url> = <expected>" additionally has its join result compared with
    the expected text, and a mismatch is reported.
    """
    base = ''
    for line in lines:
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print('%-10s : %s' % (url, parts))
        joined = urljoin(base, url)
        if not base:
            base = joined
        wrapped = '<URL:%s>' % joined
        print('%-10s = %s' % (url, wrapped))
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print('EXPECTED %s !!!!!!!!!!' % (words[2],))

def test():
    """Run the join self-test.

    Input comes from the file named by the first command-line argument,
    from standard input when that argument is '-', or from the built-in
    test_input table when no argument is given.
    """
    import sys
    args = sys.argv[1:]
    if not args:
        _run_join_checks(test_input.splitlines())
    elif args[0] == '-':
        _run_join_checks(sys.stdin)
    else:
        # Close the file even if a check raises part-way through.
        with open(args[0]) as fp:
            _run_join_checks(fp)

if __name__ == '__main__':
    test()