blob: b747bc69b56ebbe34d1a4c6e75ef0bbf8c82c550 [file] [log] [blame]
"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +00006
# A classification of schemes ('' means apply by default).
#
# Each name is a plain list of lower-case scheme names; the functions in
# this module test membership with ``scheme in <list>``.  The lists are
# kept mutable (not tuples/sets) so that code which extends them in place
# keeps working.
#
#   uses_relative    - urljoin() only resolves relative references for these
#   uses_netloc      - a leading '//' introduces a network location
#   non_hierarchical - not consulted by the functions shown in this module
#   uses_params      - ';' introduces parameters
#   uses_query       - '?' introduces a query
#   uses_fragment    - '#' introduces a fragment
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',
                 'prospero', 'rtsp', 'rtspu', '']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'file',
               'https', 'shttp', 'snews',
               'prospero', 'rtsp', 'rtspu', '']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews', 'sip',
                    ]
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip',
               '']
uses_query = ['http', 'wais',
              'https', 'shttp',
              'gopher', 'rtsp', 'rtspu', 'sip',
              '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']
28
# Characters valid in scheme names.  urlparse() only treats the text
# before the first ':' as a scheme when every character is in this string.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
Guido van Rossum23cb2a81994-09-12 10:36:35 +000034
# Number of memoized urlparse() results at which the cache is emptied:
# urlparse() calls clear_cache() before storing a new entry once the
# cache already holds this many.
MAX_CACHE_SIZE = 20
# Memo of urlparse() results, keyed by its (url, scheme, allow_fragments)
# arguments; values are the parsed 6-tuples.
_parse_cache = {}
37
def clear_cache():
    """Clear the parse cache.

    The module-level ``_parse_cache`` name is rebound to a fresh empty
    dictionary, so every previously memoized urlparse() result is dropped.
    Returns None.
    """
    global _parse_cache
    _parse_cache = {}
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000042
43
def urlparse(url, scheme='', allow_fragments=1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Arguments:
    url -- the URL string to split.
    scheme -- default scheme, used when `url` does not start with a
        valid '<scheme>:' prefix.
    allow_fragments -- when false, '#' is never treated as a fragment
        delimiter and the fragment component is always ''.

    Which delimiters are honoured depends on the scheme's membership in
    uses_netloc, uses_fragment, uses_query and uses_params.  Results are
    memoized in _parse_cache under the (url, scheme, allow_fragments)
    key; the cache is emptied once it holds MAX_CACHE_SIZE entries.
    """
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key)
    if cached is not None:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = params = query = fragment = ''
    # Set when the URL starts with a literal lower-case 'http:'.  Every
    # component applies to http, so the flag lets the common case skip
    # the list-membership tests below.
    is_http = 0
    i = url.find(':')
    if i > 0:
        prefix = url[:i]
        if prefix == 'http':  # optimize the common case
            is_http = 1
            scheme, url = prefix, url[i+1:]
        else:
            for c in prefix:
                if c not in scheme_chars:
                    # Not a scheme after all: keep the default scheme and
                    # leave the ':' as part of the remaining URL.
                    break
            else:
                scheme, url = prefix.lower(), url[i+1:]
    if (is_http or scheme in uses_netloc) and url[:2] == '//':
        # The netloc runs from after '//' up to the next '/', or to the
        # end of the string when there is no further '/'.
        i = url.find('/', 2)
        if i < 0:
            i = len(url)
        netloc, url = url[2:i], url[i:]
    if allow_fragments and (is_http or scheme in uses_fragment):
        i = url.rfind('#')  # split at the last '#'
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if is_http or scheme in uses_query:
        i = url.find('?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if is_http or scheme in uses_params:
        i = url.find(';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    # Whatever is left of `url` at this point is the path component.
    result = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000110
def urlunparse(parts):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    `parts` is a sequence of exactly six strings:
    (scheme, netloc, url, params, query, fragment), where `url` is the
    path component.  Returns the reassembled URL string.  Empty
    components are omitted together with their delimiter.
    """
    # Unpacked here rather than in the signature: tuple parameters are a
    # syntax error on Python 3, while callers still pass one 6-sequence.
    scheme, netloc, url, params, query, fragment = parts
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        # A non-empty path following a netloc must start with '/'.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
128
def urljoin(base, url, allow_fragments=1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    Arguments:
    base -- the base URL; when empty, `url` is returned unchanged.
    url -- the URL to resolve; when empty, `base` is returned unchanged.
    allow_fragments -- passed through to urlparse() for both URLs.

    `url` is returned as given when its scheme differs from the base's
    scheme or is not listed in uses_relative.  Otherwise the missing
    leading components are taken from `base` and '.' / '..' path
    segments are collapsed.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    # The base scheme is the default, so a scheme-less `url` inherits it.
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # `url` names its own host: only the scheme was inherited.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: replaces the base path entirely.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: inherit the base path, then (RFC 1808 step 5) the
        # base params if none were given, and only in that case the base
        # query if none was given either.
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Relative path: drop the last segment of the base path and append
    # the segments of the relative path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        # A trailing '.' means "this directory": keep the trailing slash.
        segments[-1] = ''
    # Drop every remaining '.' segment in one pass.
    segments = [segment for segment in segments if segment != '.']
    # Repeatedly remove the left-most '<name>/..' pair.  The first and
    # the last segment are never examined as the '..' of a pair.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                    and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # A trailing '..' removes the preceding segment and leaves a
        # trailing slash.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
180
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' not in url:
        # Nothing to strip.  Return the URL untouched instead of
        # round-tripping it through urlparse()/urlunparse(), which can
        # rewrite it (e.g. drop a '?' that has an empty query).
        return url, ''
    s, n, p, a, q, frag = urlparse(url)
    defrag = urlunparse((s, n, p, a, q, ''))
    return defrag, frag
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000191
192
# Default input for test().  The first non-blank line is the base URL;
# every following line has the form "<relative url> = <URL:expected>",
# where the expected value is the result of joining the relative URL
# with the base.  Lines are split on whitespace, so the column alignment
# is cosmetic.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y    = <URL:http://a/b/c/d?y>
      http:g?y   = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
# XXX The result for //g is actually http://g/; is this a problem?
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000227
def test():
    """Run the urlparse()/urljoin() self-test and print the results.

    Input selection follows the first command-line argument:
    no argument -- use the built-in test_input;
    '-'         -- read lines from standard input;
    otherwise   -- read lines from the file of that name.

    The first non-blank line supplies the base URL; see _run_url_checks()
    for the per-line output.
    """
    import sys
    if not sys.argv[1:]:
        _run_url_checks(test_input.splitlines())
        return
    fn = sys.argv[1]
    if fn == '-':
        # iter(readline, '') reads one line at a time and stops at EOF.
        _run_url_checks(iter(sys.stdin.readline, ''))
    else:
        # The context manager closes the file even if a check raises.
        with open(fn) as fp:
            _run_url_checks(fp)


def _run_url_checks(lines):
    """Parse and join the first word of every non-blank line in `lines`.

    For each URL the parsed 6-tuple and the joined URL are printed.  The
    first URL becomes the base for all later joins.  A line of the form
    "<url> = <expected>" additionally prints an EXPECTED notice when the
    joined result, wrapped as '<URL:...>', differs from <expected>.
    """
    base = ''
    for line in lines:
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print('%-10s : %s' % (url, parts))
        joined = urljoin(base, url)
        if not base:
            base = joined
        wrapped = '<URL:%s>' % joined
        print('%-10s = %s' % (url, wrapped))
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print('EXPECTED %s !!!!!!!!!!' % words[2])
257
# Run the self-test when the module is executed as a script.
if __name__ == '__main__':
    test()