blob: f6e20afdb8bdb31996cd78dc8877344cccf24581 [file] [log] [blame]
"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +00006
# Names exported by ``from urlparse import *``.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
Skip Montanaro40fc1602001-03-01 04:27:19 +00009
# A classification of schemes ('' means apply by default).
# These are plain module-level lists; the functions in this module test
# membership with ``scheme in <list>``.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']
Guido van Rossum23cb2a81994-09-12 10:36:35 +000028
# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
Guido van Rossum23cb2a81994-09-12 10:36:35 +000034
# urlsplit() memoizes its results in _parse_cache and empties the whole
# cache once it holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}
37
def clear_cache():
    """Clear the parse cache."""
    _parse_cache.clear()
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000041
42
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Every property reads ``self.netloc`` and picks one piece out of the
    ``user:password@host:port`` layout.  The last "@" separates the user
    information from the host part.
    """

    @property
    def username(self):
        """The user name before "@", or None if netloc has no "@"."""
        netloc = self.netloc
        if "@" not in netloc:
            return None
        userinfo = netloc.rsplit("@", 1)[0]
        # Everything up to the first ":" is the user name.
        return userinfo.split(":", 1)[0]

    @property
    def password(self):
        """The password after "user:", or None if there is none."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """The lower-cased host name, or None if it is empty.

        A bracketed IPv6 literal such as "[::1]" is returned without the
        brackets.
        """
        hostinfo = self.netloc.rsplit("@", 1)[-1]
        if hostinfo.startswith("[") and "]" in hostinfo:
            # The colons inside the brackets belong to the address, not
            # to a port separator.
            host = hostinfo[1:hostinfo.index("]")]
        else:
            host = hostinfo.split(":", 1)[0]
        return host.lower() or None

    @property
    def port(self):
        """The port as an int, or None if no port is given.

        An empty port ("host:") yields None.  A non-numeric port still
        raises ValueError from int().
        """
        hostinfo = self.netloc.rsplit("@", 1)[-1]
        if hostinfo.startswith("[") and "]" in hostinfo:
            # Skip the bracketed IPv6 literal; only what follows it can
            # hold the port.
            hostinfo = hostinfo[hostinfo.index("]") + 1:]
        if ":" in hostinfo:
            port = hostinfo.split(":", 1)[1]
            if port:
                return int(port, 10)
        return None
83
from collections import namedtuple
Fred Drakead5177c2006-04-01 22:14:43 +000085
class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
    """5-field named tuple (scheme, netloc, path, query, fragment).

    This is the type urlsplit() returns; ResultMixin adds the username,
    password, hostname and port properties.
    """

    __slots__ = ()

    def geturl(self):
        """Return the URL string rebuilt from the fields by urlunsplit()."""
        return urlunsplit(self)
92
93
class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
    """6-field named tuple (scheme, netloc, path, params, query, fragment).

    This is the type urlparse() returns; ResultMixin adds the username,
    password, hostname and port properties.
    """

    __slots__ = ()

    def geturl(self):
        """Return the URL string rebuilt from the fields by urlunparse()."""
        return urlunparse(self)
100
101
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    # urlsplit() does everything except separating the params from the path.
    scheme, netloc, url, query, fragment = urlsplit(url, scheme,
                                                    allow_fragments)
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)
Fred Drake5751a222001-11-16 02:52:57 +0000115
def _splitparams(url):
    """Split the params off a path; return (path, params).

    When the path contains "/", only a ";" at or after the last "/" counts,
    so params are taken from the last path segment only.  If no such ";"
    exists the path is returned unchanged with empty params.
    """
    if '/' in url:
        i = url.find(';', url.rfind('/'))
    else:
        i = url.find(';')
    if i < 0:
        # No separator: slicing with i == -1 would drop the last character
        # of the path, so return early.
        return url, ''
    return url[:i], url[i+1:]
124
def _splitnetloc(url, start=0):
    """Split url at the end of its network location.

    Scanning begins at index `start`.  Returns (url[start:delim],
    url[delim:]) where delim is the position of the earliest "/", "?" or
    "#" at or after `start`, or len(url) if none of them occurs.
    """
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000132
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key so that equal-comparing
    # arguments of different types get separate cache entries.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            # This fast path splits off fragment and query without
            # consulting uses_fragment / uses_query.
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        # The text before the first ':' is a scheme only if every
        # character is a legal scheme character.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000176
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    `data` is any 6-item iterable
    (scheme, netloc, path, params, query, fragment)."""
    scheme, netloc, url, params, query, fragment = data
    if params:
        # Re-attach the params to the path, then let urlunsplit() do the rest.
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))
186
def urlunsplit(data):
    """Combine a 5-item iterable (scheme, netloc, path, query, fragment)
    into a URL string.  Empty query and fragment parts are left out,
    together with their "?" / "#" delimiters."""
    scheme, netloc, url, query, fragment = data
    # Emit the "//" authority marker when there is a netloc, or when the
    # scheme is listed in uses_netloc and the path does not already start
    # with "//".
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000199
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default for a url that names no scheme.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # url carries its own network location: nothing is taken
            # from the base.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: it replaces the base path entirely.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: reuse the base path and fall back to the base
        # params and query where url has none.
        path = bpath
        if not params:
            params = bparams
        else:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                                params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: drop the last segment of the base path and append
    # the segments of url's path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly cancel the first "<name>/.." pair until none is left.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # A trailing ".." removes the segment before it and leaves a
        # trailing slash.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000256
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        # Rebuild the URL from the same parts with an empty fragment.
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000270
# unquote() is defined here for use by parse_qs() and parse_qsl().
# It cannot be imported from urllib, because that would create a circular
# import: urllib itself uses functions from this module (e.g. urljoin).
274
Senthil Kumaranf3e9b2a2010-03-18 12:14:15 +0000275
# Lookup table mapping every two-character hex string (either case for
# each digit) to the character with that code.
_hexdig = '0123456789ABCDEFabcdef'
_hextochr = dict((a+b, chr(int(a+b,16))) for a in _hexdig for b in _hexdig)

def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Each "%XX" escape with two hex digits is replaced by the character it
    encodes; a "%" that is not followed by two hex digits is kept as is.
    """
    res = s.split('%')
    # res[0] is the text before the first '%'; every later item starts
    # with what followed a '%'.
    for i in range(1, len(res)):
        item = res[i]
        try:
            res[i] = _hextochr[item[:2]] + item[2:]
        except KeyError:
            # Not a valid escape: put the '%' back untouched.
            res[i] = '%' + item
        except UnicodeDecodeError:
            res[i] = unichr(int(item[:2], 16)) + item[2:]
    return "".join(res)
291
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

        Arguments:

        qs: URL-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            URL encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        Returns a dictionary mapping each name to the list of its values,
        in the order parse_qsl() produced them.
    """
    result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        if name in result:
            result[name].append(value)
        else:
            result[name] = [value]
    return result
317
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were  not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples in the order the fields
    appear in qs.
    """
    # Fields may be separated by either '&' or ';'.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            # '+' encodes a space; convert it before %-unquoting.
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
357
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000358
# Input for test(): the first non-blank line is the base URL; every later
# line is "<relative url> = <expected result of joining it to the base>".
test_input = """
      http://a/b/c/d

      g:h = <URL:g:h>
      http:g = <URL:http://a/b/c/g>
      http: = <URL:http://a/b/c/d>
      g = <URL:http://a/b/c/g>
      ./g = <URL:http://a/b/c/g>
      g/ = <URL:http://a/b/c/g/>
      /g = <URL:http://a/g>
      //g = <URL:http://g>
      ?y = <URL:http://a/b/c/d?y>
      g?y = <URL:http://a/b/c/g?y>
      g?y/./x = <URL:http://a/b/c/g?y/./x>
      . = <URL:http://a/b/c/>
      ./ = <URL:http://a/b/c/>
      .. = <URL:http://a/b/>
      ../ = <URL:http://a/b/>
      ../g = <URL:http://a/b/g>
      ../.. = <URL:http://a/>
      ../../g = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g = <URL:http://a/b/g>
      ./g/. = <URL:http://a/b/c/g/>
      /./g = <URL:http://a/./g>
      g/./h = <URL:http://a/b/c/g/h>
      g/../h = <URL:http://a/b/c/h>
      http:g = <URL:http://a/b/c/g>
      http: = <URL:http://a/b/c/d>
      http:?y = <URL:http://a/b/c/d?y>
      http:g?y = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000392
def _run_url_tests(lines):
    """Parse and join the URL at the start of every non-blank line.

    The first URL seen becomes the base for all later joins.  A line of
    the form "<url> = <expected>" is also checked against the join result,
    and a mismatch is reported on stdout.
    """
    base = ''
    for line in lines:
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print('%-10s : %s' % (url, parts))
        joined = urljoin(base, url)
        if not base:
            base = joined
        wrapped = '<URL:%s>' % joined
        print('%-10s = %s' % (url, wrapped))
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print('EXPECTED %s !!!!!!!!!!' % words[2])

def test():
    """Run the self-test.

    Test lines come from the file named by the first command-line
    argument ("-" means standard input), or from the built-in test_input
    when no argument is given.
    """
    import sys
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            _run_url_tests(sys.stdin)
        else:
            # Close the file even if a test line raises.
            with open(fn) as fp:
                _run_url_tests(fp)
    else:
        _run_url_tests(test_input.splitlines())
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000423
# Run the self-test when this module is executed as a script.
if __name__ == '__main__':
    test()