# blob: c169f380049d35dfb0a85267e2690bc0356c2a92
"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""

# Names exported by ``from urlparse import *``.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# Upper bound on the number of entries kept in _parse_cache.
MAX_CACHE_SIZE = 20
# Module-level cache of parse results, emptied by clear_cache().
_parse_cache = {}


def clear_cache():
    """Clear the parse cache."""
    _parse_cache.clear()


class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Every property is derived from ``self.netloc``, which the class
    mixing this in is expected to provide.
    """

    @property
    def username(self):
        """User name from the ``user[:password]@`` prefix, or None.

        None is returned when netloc contains no "@".
        """
        netloc = self.netloc
        if "@" in netloc:
            # Split on the last "@" so the userinfo itself may contain "@".
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Password from the ``user:password@`` prefix, or None.

        None is returned when netloc has no "@" or the userinfo has no ":".
        """
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host part of netloc, or None when it is empty.

        A bracketed host (``[...]``) is returned without the brackets.
        Raises ValueError when only one of "[" and "]" is present.
        """
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            # Drop the leading "[" and everything from "]" onwards.
            return netloc.split(']')[0][1:].lower()
        elif '[' in netloc or ']' in netloc:
            raise ValueError("Invalid IPv6 hostname")
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Port number as an int, or None when no port is given.

        An empty port (``host:``) also yields None rather than failing
        inside ``int('')``.  A non-numeric port still raises ValueError.
        """
        # Strip the userinfo and any bracketed host so that only the
        # ":port" suffix (if any) can contribute a colon.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            if port:
                return int(port, 10)
        return None

from collections import namedtuple

class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
    """5-field named tuple (scheme, netloc, path, query, fragment).

    ResultMixin adds the netloc-derived properties.
    """

    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL string via urlunsplit()."""
        return urlunsplit(self)


class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
    """6-field named tuple (scheme, netloc, path, params, query, fragment).

    ResultMixin adds the netloc-derived properties.
    """

    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL string via urlunparse()."""
        return urlunparse(self)


def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    The split itself is delegated to urlsplit(); params are only
    separated from the path when the scheme is listed in uses_params."""
    scheme, netloc, url, query, fragment = urlsplit(url, scheme,
                                                    allow_fragments)
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)

def _splitparams(url):
    """Split ``path;params`` into (path, params).

    When the path contains "/", only a ";" at or after the last "/" is
    treated as the separator.  If no separator is found the path is
    returned unchanged with empty params.
    """
    if '/' in url:
        i = url.find(';', url.rfind('/'))
    else:
        i = url.find(';')
    if i < 0:
        # Without this guard i == -1 would slice off the last character.
        return url, ''
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    """Split off the network location that begins at index ``start``.

    Returns (netloc, rest), where netloc runs up to the first "/", "?"
    or "#" found at or after ``start`` (or to the end of the string) and
    rest is the remainder beginning at that delimiter.

    Raises ValueError("Invalid IPv6 URL") when the netloc contains only
    one of "[" and "]".  Brackets elsewhere in the URL (path, query,
    fragment) are not considered.
    """
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    netloc = url[start:delim]
    # Validate the netloc only: a stray bracket later in the URL must
    # not make an otherwise valid netloc fail.
    if ('[' in netloc) != (']' in netloc):
        raise ValueError("Invalid IPv6 URL")
    return netloc, url[delim:]   # return (domain, rest)

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Results are memoized in _parse_cache; the whole cache is cleared
    once it holds MAX_CACHE_SIZE entries.  A ValueError raised by
    _splitnetloc() for an invalid IPv6 URL is propagated."""
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key so that equal-comparing
    # values of different types do not share a cache entry.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http':  # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        # Accept the text before ":" as a scheme only if every character
        # is a valid scheme character (for/else: no break occurred).
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v

def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    ``data`` is a 6-item iterable:
    (scheme, netloc, path, params, query, fragment)."""
    scheme, netloc, url, params, query, fragment = data
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))

def urlunsplit(data):
    """Combine a 5-item (scheme, netloc, path, query, fragment) iterable
    into a URL string.

    Empty components are omitted together with their delimiter."""
    scheme, netloc, url, query, fragment = data
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        # A non-empty path must be separated from the netloc by "/".
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url

def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.
    ``url`` is also returned unchanged when its scheme differs from the
    base scheme or is not listed in uses_relative."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    # The base scheme is the default scheme for the reference.
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty reference path: inherit the base path, and the base
        # params/query when the reference supplies none.
        path = bpath
        if not params:
            params = bparams
        else:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Replace the last base segment with the reference path segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse the first "<segment>/.." pair until none is
    # left; the last segment is never examined by this loop.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                    and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))

def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        # Rebuild the URL with an empty fragment component.
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''

# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference:
# urllib uses urlparse methods (urljoin).


_hexdig = '0123456789ABCDEFabcdef'
# Maps every two-character hex string (upper, lower or mixed case) to the
# character with that code.
_hextochr = dict((a+b, chr(int(a+b, 16))) for a in _hexdig for b in _hexdig)


def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    A "%" that is not followed by two hex digits is left untouched.
    """
    res = s.split('%')
    # res[0] precedes the first "%"; every later item starts right after
    # a "%" and may begin with a two-digit hex escape.
    for i in range(1, len(res)):
        item = res[i]
        try:
            res[i] = _hextochr[item[:2]] + item[2:]
        except KeyError:
            # Not a valid escape: put the "%" back.
            res[i] = '%' + item
        except UnicodeDecodeError:
            # Python 2 only (unichr): fallback when concatenating the
            # decoded character with item raised UnicodeDecodeError.
            res[i] = unichr(int(item[:2], 16)) + item[2:]
    return "".join(res)

def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

        Arguments:

        qs: URL-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            URL encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        Returns a dictionary mapping each name to the list of its
        values, in the order parse_qsl() produced them.
    """
    result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        result.setdefault(name, []).append(value)
    return result

def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples.
    """
    # Both "&" and ";" separate fields.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            # "+" encodes a space; translate it before %-unquoting.
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r


# Input for test(): the first non-blank line is the base URL; each later
# line is "<relative url> = <expected urljoin result wrapped in <URL:...>>".
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""

def test():
    """Run the urljoin self-test and print the results.

    Input comes from the file named by sys.argv[1] ("-" means stdin) or,
    without an argument, from test_input.  The first URL read becomes
    the base; each line of the form "<url> = <expected>" is checked and
    a mismatch is reported with an "EXPECTED" line.
    """
    import sys
    base = ''
    opened = None   # file we opened ourselves and therefore must close
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        try:
            from cStringIO import StringIO
        except ImportError:
            try:
                from StringIO import StringIO
            except ImportError:
                from io import StringIO
        fp = StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            # Single-argument print(...) behaves the same as a statement
            # and as a function call.
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % (words[2],))
    finally:
        if opened is not None:
            opened.close()

# Run the self-test when executed as a script.
if __name__ == '__main__':
    test()