blob: 1d065d3986e8d445c008697bcec67f40c1c4fa5a [file] [log] [blame]
"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +00006
# Public names exported by a star-import of this module.
__all__ = [
    "urlparse",
    "urlunparse",
    "urljoin",
    "urldefrag",
    "urlsplit",
    "urlunsplit",
    "parse_qs",
    "parse_qsl",
]
Skip Montanaro40fc1602001-03-01 04:27:19 +00009
# A classification of schemes ('' means apply by default)
# The lists are plain, mutable module-level lists; the order of the
# entries is kept exactly as it was.

# Consulted by urljoin() to decide whether a reference may be resolved
# against the base at all.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', '', 'sftp',
]

# Consulted by urljoin() and urlunsplit() when handling the '//netloc' part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
    'svn', 'svn+ssh', 'sftp', 'nfs',
]

non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips',
]

# Consulted by urlparse() before splitting ';params' off the path.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    'mms', '', 'sftp',
]

# Consulted by urlsplit() before splitting off '?query'.
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '',
]

# Consulted by urlsplit() before splitting off '#fragment'.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero', '',
]
Guido van Rossum23cb2a81994-09-12 10:36:35 +000028
# Characters valid in scheme names
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz'    # lower-case letters
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'    # upper-case letters
    '0123456789'                    # digits
    '+-.'                           # permitted punctuation
)
Guido van Rossum23cb2a81994-09-12 10:36:35 +000034
# Size limit for the urlsplit() result cache: urlsplit() empties the
# whole cache before adding an entry once it holds this many results.
MAX_CACHE_SIZE = 20

# Maps a urlsplit() argument key to the result computed for it.
_parse_cache = {}


def clear_cache():
    """Discard every entry held in the parse cache."""
    # Mutate in place so every reference to the dict sees the reset.
    _parse_cache.clear()
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000041
42
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Every property is computed on demand from ``self.netloc``, which the
    concrete result class must provide.  The netloc is read as
    ``[user[:password]@]host[:port]``; ``host`` may be a bracketed IPv6
    literal such as ``[::1]``.
    """

    @property
    def _hostinfo(self):
        """Return ``(host, port)`` as raw strings cut out of the netloc.

        ``port`` is None when no ':' introduces a port; it is the empty
        string when the ':' is present but nothing follows it.
        """
        hostport = self.netloc.rsplit("@", 1)[-1]
        if hostport[:1] == "[" and "]" in hostport:
            # Bracketed IPv6 literal: the colons between the brackets
            # belong to the address, so only a ':' directly after ']'
            # can introduce the port.
            host, _, tail = hostport[1:].partition("]")
            if tail[:1] == ":":
                return host, tail[1:]
            return host, None
        if ":" in hostport:
            host, port = hostport.split(":", 1)
            return host, port
        return hostport, None

    @property
    def username(self):
        """The user name before '@' (up to the first ':'), or None."""
        netloc = self.netloc
        if "@" not in netloc:
            return None
        # rsplit: everything before the *last* '@' is user information.
        return netloc.rsplit("@", 1)[0].split(":", 1)[0]

    @property
    def password(self):
        """The text after the first ':' of the user info, or None."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """The lower-cased host, or None if the netloc has no host.

        For a bracketed IPv6 literal the brackets are stripped.
        """
        host = self._hostinfo[0]
        return host.lower() or None

    @property
    def port(self):
        """The port as an int, or None if it is absent or empty.

        Raises ValueError if a port is present but is not a decimal
        number.
        """
        port = self._hostinfo[1]
        if not port:
            # No ':' at all, or a dangling ':' with nothing after it.
            return None
        return int(port, 10)
83
Raymond Hettinger0f6a6562008-01-11 18:04:55 +000084from collections import namedtuple
Fred Drakead5177c2006-04-01 22:14:43 +000085
class SplitResult(namedtuple('SplitResult',
                             ['scheme', 'netloc', 'path', 'query', 'fragment']),
                  ResultMixin):
    """5-tuple built by urlsplit(), with the ResultMixin properties added."""

    # No per-instance attributes beyond the tuple fields.
    __slots__ = ()

    def geturl(self):
        """Return the URL string produced by urlunsplit() for this tuple."""
        return urlunsplit(self)
92
93
class ParseResult(namedtuple('ParseResult',
                             ['scheme', 'netloc', 'path', 'params',
                              'query', 'fragment']),
                  ResultMixin):
    """6-tuple built by urlparse(), with the ResultMixin properties added."""

    # No per-instance attributes beyond the tuple fields.
    __slots__ = ()

    def geturl(self):
        """Return the URL string produced by urlunparse() for this tuple."""
        return urlunparse(self)
100
101
def urlparse(url, scheme='', allow_fragments=True):
    """Break a URL into its six components.

    The general shape is
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    and the result is a ParseResult, i.e. the 6-tuple
    (scheme, netloc, path, params, query, fragment).
    The components are not subdivided any further (netloc stays one
    string) and % escapes are left as they are.
    """
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Params are only split off for schemes listed in uses_params.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
Fred Drake5751a222001-11-16 02:52:57 +0000115
def _splitparams(url):
    """Split a path at the ';' that starts its params; return (path, params).

    When the path contains a '/', only a ';' at or after the last '/'
    counts, and if there is none the path is returned whole with empty
    params.  Otherwise the split is made at the first ';'.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semi = url.find(';', last_slash)
        if semi < 0:
            return url, ''
    else:
        # The caller is expected to have checked that a ';' is present.
        semi = url.find(';')
    return url[:semi], url[semi + 1:]
124
def _splitnetloc(url, start=0):
    """Split url into (netloc, rest), with the netloc beginning at start.

    The netloc runs up to the earliest '/', '?' or '#' found at or after
    start, or to the end of the string if none of them occurs; rest is
    everything from that delimiter onwards.
    """
    found = [pos for pos in (url.find(ch, start) for ch in '/?#') if pos >= 0]
    # Fall back to the end of the string when no delimiter was found.
    end = min(found + [len(url)])
    return url[start:end], url[end:]
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +0000132
def urlsplit(url, scheme='', allow_fragments=True):
    """Break a URL into its five components.

    The general shape is
    <scheme>://<netloc>/<path>?<query>#<fragment>
    and the result is a SplitResult, i.e. the 5-tuple
    (scheme, netloc, path, query, fragment).
    The components are not subdivided any further (netloc stays one
    string) and % escapes are left as they are.

    Results are memoised in the module-level parse cache, keyed on the
    arguments together with the types of url and scheme.
    """
    allow_fragments = bool(allow_fragments)
    cache_key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(cache_key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    # A literal 'http' prefix is the common case: it is accepted without
    # the per-character check and without consulting the uses_* tables.
    is_http = False
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':
            is_http = True
            scheme, url = prefix.lower(), url[colon + 1:]
        elif all(ch in scheme_chars for ch in prefix):
            scheme, url = prefix.lower(), url[colon + 1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if (allow_fragments
            and (is_http or scheme in uses_fragment)
            and '#' in url):
        url, fragment = url.split('#', 1)
    if (is_http or scheme in uses_query) and '?' in url:
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000176
def urlunparse(data):
    """Reassemble a URL from a 6-tuple as returned by urlparse().

    data is (scheme, netloc, path, params, query, fragment).  The result
    may differ slightly from the URL that was parsed, while remaining
    equivalent, if the original had redundant delimiters, e.g. a ? with
    an empty query (the draft states that these are equivalent).
    """
    scheme, netloc, path, params, query, fragment = data
    if params:
        # Fold the params back into the path, then reuse urlunsplit().
        path = "%s;%s" % (path, params)
    return urlunsplit((scheme, netloc, path, query, fragment))
186
def urlunsplit(data):
    """Reassemble a URL from a 5-tuple as returned by urlsplit().

    data is (scheme, netloc, path, query, fragment).  Empty components
    are left out together with their delimiter.
    """
    scheme, netloc, path, query, fragment = data
    # A '//' authority part is emitted when there is a netloc, or when
    # the scheme is listed in uses_netloc and the path does not already
    # begin with '//'.
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and path[:2] != '//')
    if wants_authority:
        if path and path[:1] != '/':
            path = '/' + path
        path = '//' + (netloc or '') + path
    result = path
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000199
def urljoin(base, url, allow_fragments=True):
    """Resolve a possibly relative URL against a base URL.

    Returns url unchanged when base is empty, and base unchanged when
    url is empty.  Otherwise both are parsed with urlparse() and the
    missing leading components of url are taken from base; url is
    returned as is when its scheme differs from the base's or is not
    listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own netloc: nothing to inherit.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        path = bpath
        if params:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))

    # Relative path: replace the last segment of the base path with the
    # segments of the reference, then normalise '.' and '..'.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    while True:
        last = len(segments) - 1
        # Neither the first nor the last segment is examined here.
        for idx in range(1, last):
            if (segments[idx] == '..'
                    and segments[idx - 1] not in ('', '..')):
                # Drop the '..' together with the segment it cancels,
                # then rescan from the start.
                del segments[idx - 1:idx + 1]
                break
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000256
def urldefrag(url):
    """Strip the fragment, if any, from a URL.

    Returns the pair (URL without fragment, fragment).  When the URL
    contains no '#', it is returned untouched and the fragment is the
    empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000270
# unquote method for parse_qs and parse_qsl
# It cannot be used directly from urllib, as that would create a circular
# reference: urllib uses functions from this module (urljoin).

# Maps every two-digit hex string, in lower and in upper case, to the
# character with that code.
_hextochr = dict(
    (fmt % code, chr(code))
    for fmt in ('%02x', '%02X')
    for code in range(256)
)


def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    pieces = s.split('%')
    # Text before the first '%' contains no escape and is kept as is.
    decoded = [pieces[0]]
    for item in pieces[1:]:
        try:
            decoded.append(_hextochr[item[:2]] + item[2:])
        except KeyError:
            # Not a valid two-digit escape: put the '%' back unchanged.
            decoded.append('%' + item)
        except UnicodeDecodeError:
            decoded.append(unichr(int(item[:2], 16)) + item[2:])
    return "".join(decoded)
290
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a dictionary of value lists.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dict mapping each name to the list of its values, in the
    order in which parse_qsl() produced them.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
316
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Fields are separated by '&' or ';'.  In both name and value, '+' is
    turned into a space before % escapes are decoded.

    Returns a list, as G-d intended.
    """
    result = []
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not field and not strict_parsing:
                continue
            name, sep, value = field.partition('=')
            if not sep:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # Handle case of a control-name with no equal sign
                if not keep_blank_values:
                    continue
            if value or keep_blank_values:
                name = unquote(name.replace('+', ' '))
                value = unquote(value.replace('+', ' '))
                result.append((name, value))

    return result
356
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000357
# Sample data for test(): the first non-blank line is the base URL; each
# later line reads "<reference> = <URL:expected result of joining it to
# the base>".
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000391
def test():
    """Parse and join a list of URLs, printing each result.

    Input lines come from the file named by the first command-line
    argument ('-' means standard input), or from test_input when no
    argument is given.  The first URL read becomes the base against
    which every later one is joined.  A line of the form
    "<url> = <expected>" is additionally checked, and a mismatch is
    reported on an 'EXPECTED' line.
    """
    import sys
    opened_here = False
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
            opened_here = True
    else:
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        fp = StringIO(test_input)
    base = ''
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        # Only close what this function opened; stdin is left alone.
        if opened_here:
            fp.close()


if __name__ == '__main__':
    test()