blob: 5e5d37d132c1231a87c7f770b4a5c24a5cc360b7 [file] [log] [blame]
"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +00006
# Names exported by ``from urlparse import *``.
__all__ = [
    "urlparse", "urlunparse", "urljoin", "urldefrag",
    "urlsplit", "urlunsplit", "parse_qs", "parse_qsl",
]
Skip Montanaro40fc1602001-03-01 04:27:19 +00009
# A classification of schemes ('' means apply by default).
# These are plain lists so that callers can extend them at run time.

# Schemes for which urljoin() resolves a relative reference.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', '', 'sftp',
]

# Schemes whose URLs may carry a '//netloc' part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
    'svn', 'svn+ssh', 'sftp', 'nfs',
]

# Not consulted by any code visible in this file.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips',
]

# Schemes for which urlparse() splits ';params' off the path.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    'mms', '', 'sftp',
]

# Schemes for which urlsplit() splits off a '?query'.
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '',
]

# Schemes for which urlsplit() splits off a '#fragment'.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero', '',
]

# Characters valid in scheme names
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789'
    '+-.'
)
Guido van Rossum23cb2a81994-09-12 10:36:35 +000034
# urlsplit() empties the whole cache once it holds this many entries.
MAX_CACHE_SIZE = 20
# Memo of urlsplit() results, keyed on the call arguments and their types.
_parse_cache = {}


def clear_cache():
    """Empty the parse cache (in place; the dict object is kept)."""
    _parse_cache.clear()
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000041
42
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Every property is derived from ``self.netloc``, which the concrete
    result class must provide as a string shaped like
    ``[user[:password]@]host[:port]``.
    """

    @property
    def username(self):
        """User name part of the netloc, or None if it has no '@'."""
        userinfo, at_sign, _ = self.netloc.rpartition("@")
        if not at_sign:
            return None
        return userinfo.partition(":")[0]

    @property
    def password(self):
        """Password part of the netloc, or None if none is given."""
        userinfo = self.netloc.rpartition("@")[0]
        _, colon, secret = userinfo.partition(":")
        if colon:
            return secret
        return None

    @property
    def hostname(self):
        """Lower-cased host name, or None if the host part is empty.

        The brackets around an IPv6 literal ("[::1]") are stripped.
        """
        return self._hostinfo()[0].lower() or None

    @property
    def port(self):
        """Port as an int, or None if no (or an empty) port is given.

        Raises ValueError if the port text is not a decimal number.
        """
        port = self._hostinfo()[1]
        if port:
            return int(port, 10)
        return None

    def _hostinfo(self):
        """Return (host, port) as raw strings; port is '' when absent."""
        hostport = self.netloc.rpartition("@")[2]
        if hostport[:1] == "[":
            # Bracketed IPv6 literal: the colons inside the brackets belong
            # to the address, so only look for a port after the ']'.
            host, closing, tail = hostport[1:].partition("]")
            if closing:
                return host, tail.partition(":")[2]
        host, _, port = hostport.partition(":")
        return host, port
83
Raymond Hettinger0f6a6562008-01-11 18:04:55 +000084from collections import namedtuple
Fred Drakead5177c2006-04-01 22:14:43 +000085
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """Named 5-tuple (scheme, netloc, path, query, fragment)."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunsplit()."""
        return urlunsplit(self)
92
93
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """Named 6-tuple (scheme, netloc, path, params, query, fragment)."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunparse()."""
        return urlunparse(self)
100
101
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).

    The split itself is done by urlsplit(); this function only separates
    the ';params' part from the path, and only for schemes listed in
    uses_params.  Components are not broken up further (netloc stays a
    single string) and % escapes are not expanded.
    """
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
Fred Drake5751a222001-11-16 02:52:57 +0000115
def _splitparams(url):
    """Split *url* into (path, params) at a ';'.

    When the path contains a '/', only a ';' at or after the last '/'
    counts, and (url, '') is returned if there is none.  Without a '/',
    the first ';' is used; the caller is expected to pass a string that
    contains one.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semi = url.find(';', last_slash)
        if semi < 0:
            return url, ''
    else:
        semi = url.find(';')
    return url[:semi], url[semi + 1:]
124
def _splitnetloc(url, start=0):
    """Return (netloc, rest) for *url*, with the netloc starting at *start*.

    The netloc ends at the earliest '/', '?' or '#' found at or after
    *start*, or at the end of the string when none of them occurs.  The
    delimiter itself stays at the front of *rest*.
    """
    hits = [pos for pos in (url.find(ch, start) for ch in '/?#') if pos >= 0]
    end = min(hits) if hits else len(url)
    return url[start:end], url[end:]
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000132
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).

    Components are not broken up further (netloc stays a single string)
    and % escapes are not expanded.  Results are memoised in _parse_cache,
    keyed on the arguments and their types; the cache is emptied as a
    whole once it holds MAX_CACHE_SIZE entries.
    """
    allow_fragments = bool(allow_fragments)
    cache_key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(cache_key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':  # optimize the common case
            # 'http' is in every uses_* table, so the membership tests of
            # the general path below are skipped here.
            scheme = prefix.lower()
            url = url[colon + 1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            result = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[cache_key] = result
            return result
        # Only accept the text before ':' as a scheme when every character
        # is a legal scheme character; otherwise keep the default scheme
        # and leave the URL untouched.
        if all(ch in scheme_chars for ch in prefix):
            scheme, url = prefix.lower(), url[colon + 1:]

    if scheme in uses_netloc and url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000175
def urlunparse(data):
    """Put a parsed URL back together again.

    *data* is a 6-item sequence (scheme, netloc, path, params, query,
    fragment).  The result may differ slightly from the URL that was
    parsed, while being equivalent, if the original had redundant
    delimiters, e.g. a ? with an empty query (the draft states that
    these are equivalent).
    """
    scheme, netloc, path, params, query, fragment = data
    if params:
        path = "%s;%s" % (path, params)
    return urlunsplit((scheme, netloc, path, query, fragment))
185
def urlunsplit(data):
    """Combine a 5-item sequence (scheme, netloc, path, query, fragment)
    into a URL string.

    Empty components are left out together with their delimiter.
    """
    scheme, netloc, url, query, fragment = data
    # An authority part ('//netloc') is written when a netloc is given, or
    # when the scheme is one that takes a netloc and the path does not
    # already start with '//'.
    needs_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if needs_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000198
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.
    *url* is also returned unchanged when its scheme differs from the
    base's or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, _bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own host: nothing comes from base.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        path = bpath
        if params:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))

    # Relative path: replace the last segment of the base path with the
    # segments of the reference, then normalise '.' and '..'.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly drop the first "<name>/.." pair found among the interior
    # segments (the first and last positions are never examined here).
    collapsed = True
    while collapsed:
        collapsed = False
        for i in range(1, len(segments) - 1):
            if (segments[i] == '..'
                    and segments[i - 1] not in ('', '..')):
                del segments[i - 1:i + 1]
                collapsed = True
                break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000255
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string and the URL is returned exactly as given.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000269
# unquote() helper for parse_qs() and parse_qsl().
# It cannot be imported from urllib: urllib itself uses this module
# (urljoin), so the import would be circular.

# Two-hex-digit escape -> character.  Built from every combination of
# upper- and lower-case digits so that mixed-case escapes such as '%aB'
# are decoded as well.
_hexdig = '0123456789ABCDEFabcdef'
_hextochr = dict((a + b, chr(int(a + b, 16)))
                 for a in _hexdig for b in _hexdig)

try:
    _unichr = unichr        # Python 2
except NameError:
    _unichr = chr           # Python 3: chr() already returns text


def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Each '%XX' escape with two hexadecimal digits is replaced by the
    character with that code; any other '%' is left in place.
    """
    pieces = s.split('%')
    decoded = [pieces[0]]           # text before the first '%' is literal
    for item in pieces[1:]:
        try:
            decoded.append(_hextochr[item[:2]] + item[2:])
        except KeyError:
            # Not a valid two-digit escape: keep the '%' verbatim.
            decoded.append('%' + item)
        except UnicodeDecodeError:
            # Python 2 only: a non-ASCII byte could not be concatenated
            # with unicode text, so use the unicode character instead.
            decoded.append(_unichr(int(item[:2], 16)) + item[2:])
    return "".join(decoded)
289
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dictionary mapping each field name to the list of its
    values, in the order parse_qsl() produced them.
    """
    fields = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        fields.setdefault(name, []).append(value)
    return fields
315
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) pairs in the order they appear in
    *qs*.  Fields are separated by '&' or ';'; '+' is turned into a
    space and % escapes are decoded in both names and values.
    """
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
355
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000356
# Default script for test().  The first non-blank line is the base URL.
# Every following line is "<url> = <URL:expected>", where the expected
# value is what urljoin(base, url) should give.  test() splits each line
# on whitespace, so column alignment does not matter.
test_input = """
 http://a/b/c/d

 g:h = <URL:g:h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 g = <URL:http://a/b/c/g>
 ./g = <URL:http://a/b/c/g>
 g/ = <URL:http://a/b/c/g/>
 /g = <URL:http://a/g>
 //g = <URL:http://g>
 ?y = <URL:http://a/b/c/d?y>
 g?y = <URL:http://a/b/c/g?y>
 g?y/./x = <URL:http://a/b/c/g?y/./x>
 . = <URL:http://a/b/c/>
 ./ = <URL:http://a/b/c/>
 .. = <URL:http://a/b/>
 ../ = <URL:http://a/b/>
 ../g = <URL:http://a/b/g>
 ../.. = <URL:http://a/>
 ../../g = <URL:http://a/g>
 ../../../g = <URL:http://a/../g>
 ./../g = <URL:http://a/b/g>
 ./g/. = <URL:http://a/b/c/g/>
 /./g = <URL:http://a/./g>
 g/./h = <URL:http://a/b/c/g/h>
 g/../h = <URL:http://a/b/c/h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 http:?y = <URL:http://a/b/c/d?y>
 http:g?y = <URL:http://a/b/c/g?y>
 http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000390
def test():
    """Run the urljoin() self-test and print the results.

    The test script is read from the file named by the first command
    line argument ('-' means standard input), or from test_input when
    no argument is given.  The first URL in the script becomes the base;
    every URL is parsed and joined against it, and a line starting with
    EXPECTED is printed whenever the result differs from the value given
    after '=' on that line.
    """
    import sys
    base = ''
    opened = None       # file we opened ourselves and therefore must close
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            lines = sys.stdin
        else:
            lines = opened = open(fn)
    else:
        lines = test_input.splitlines()
    try:
        for line in lines:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            # Single-argument print(...) behaves the same as a statement
            # (Python 2) and as a function (Python 3).
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        if opened is not None:
            opened.close()


if __name__ == '__main__':
    test()