blob: 929315e9630018183fb4ee3cadf6eff76af6b1c2 [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""Parse (absolute and relative) URLs.
2
3See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
4UC Irvine, June 1995.
5"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +00006
7# Standard/builtin Python modules
8import string
Guido van Rossuma25d7dd2000-04-10 17:02:46 +00009from string import join, split, rfind
Guido van Rossum23cb2a81994-09-12 10:36:35 +000010
# A classification of schemes ('' means apply by default).
# These are plain lists so that callers can register further schemes
# by appending to them.

# Schemes for which urljoin() resolves relative references.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',
                 'prospero', 'rtsp', 'rtspu', '']

# Schemes whose URLs may carry a '//<netloc>' part.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'file',
               'https', 'shttp', 'snews',
               'prospero', 'rtsp', 'rtspu', '']

# Not consulted by the functions in this section of the module.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews', 'sip',
                    ]

# Schemes whose URLs may carry ';<params>'.
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip',
               '']

# Schemes whose URLs may carry '?<query>'.
uses_query = ['http', 'wais',
              'https', 'shttp',
              'gopher', 'rtsp', 'rtspu', 'sip',
              '']

# Schemes whose URLs may carry '#<fragment>'.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names.  Spelled out literally instead of
# being derived from string.letters, whose value depends on the current
# locale; scheme names are restricted to ASCII letters, digits and '+-.'.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlparse() empties its result cache once it holds this many entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}
38
def clear_cache():
    """Discard every memoized urlparse() result.

    The module-level cache name is rebound to a brand-new empty
    dictionary; the previous dictionary object itself is not modified.
    """
    global _parse_cache
    _parse_cache = dict()
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000043
44
def _split_netloc(url, delimiters):
    """Split a string that starts with '//' into (netloc, rest).

    netloc is the text after the leading '//' up to, but not including,
    the earliest character of *delimiters* that occurs in it.  When no
    delimiter is present, netloc runs to the end of the string and rest
    is ''.
    """
    end = len(url)
    for delim in delimiters:
        pos = url.find(delim, 2)
        if 0 <= pos < end:
            end = pos
    return url[2:end], url[end:]


def urlparse(url, scheme='', allow_fragments=1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Arguments:
    url -- the URL string to parse.
    scheme -- default scheme, used when url does not start with a
        valid '<scheme>:' prefix.  A scheme found in url is returned
        in lower case.
    allow_fragments -- when false, '#' is never treated as a fragment
        delimiter and the fragment component is always ''.

    Which of netloc, params, query and fragment are split off depends
    on whether the effective scheme is listed in uses_netloc,
    uses_params, uses_query and uses_fragment respectively.

    Results are memoized in the module-level parse cache, keyed on the
    three arguments; the cache is emptied once it holds MAX_CACHE_SIZE
    entries.
    """
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key)
    if cached:  # a cached result is a 6-tuple, hence always true
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = params = query = fragment = ''

    # A scheme is only recognized when a non-empty run of scheme
    # characters precedes the first ':'.
    colon = url.find(':')
    if colon > 0:
        for c in url[:colon]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:colon].lower(), url[colon+1:]

    # RFC 1808 sec. 2.4.1: the fragment starts at the left-most '#'.
    # It is removed before the netloc is looked at so that it can
    # never end up inside the netloc (e.g. 'http://host#frag').
    if allow_fragments and scheme in uses_fragment:
        pos = url.find('#')
        if pos >= 0:
            url, fragment = url[:pos], url[pos+1:]

    has_query = scheme in uses_query
    if scheme in uses_netloc and url[:2] == '//':
        # The netloc ends at the next '/'; for schemes that take a
        # query it also ends at '?' (RFC 2396 sec. 3.2), so that
        # 'http://host?q' yields netloc 'host' and query 'q'.
        if has_query:
            delimiters = '/?'
        else:
            delimiters = '/'
        netloc, url = _split_netloc(url, delimiters)
    if has_query:
        pos = url.find('?')
        if pos >= 0:
            url, query = url[:pos], url[pos+1:]
    if scheme in uses_params:
        pos = url.find(';')
        if pos >= 0:
            url, params = url[:pos], url[pos+1:]

    result = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000112
def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    components is a 6-item sequence
    (scheme, netloc, url, params, query, fragment), such as the tuple
    returned by urlparse().  Unpacking it raises ValueError when it
    does not hold exactly six items.

    Return the reassembled URL string.
    """
    # Unpacked explicitly rather than through a tuple parameter in the
    # signature; callers still pass one 6-item sequence.
    scheme, netloc, url, params, query, fragment = components
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        # A netloc must be followed by a path starting with '/'.
        if url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
130
def urljoin(base, url, allow_fragments=1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    Arguments:
    base -- the base URL; when empty, url is returned unchanged.
    url -- the (possibly relative) URL; when empty, base is returned
        unchanged (RFC 1808 sec. 4 step 2a).
    allow_fragments -- passed through to urlparse() for both URLs.

    Resolution only takes place when url has the same scheme as base
    (or none) and that scheme is listed in uses_relative; otherwise
    url is returned re-assembled by urlunparse().
    """
    if not base:
        return url
    if not url:
        # An entirely empty reference denotes the base URL itself,
        # including its params, query and fragment.
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # RFC 1808 sec. 4 step 5: an empty path inherits the base path;
        # the base params are inherited only when none were given, and
        # the base query only when neither params nor query were given.
        path = bpath
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Drop the last segment of the base path and append the relative one.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly remove the left-most '<segment>/..' pair that is not
    # in last position.  A '..' (or empty) segment is never removed as
    # the '<segment>' half (RFC 1808 sec. 4 step 6c), so surplus '..'
    # segments are kept: '../../../../g' against 'http://a/b/c/d'
    # gives 'http://a/../../g'.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if segments[i] == '..' and segments[i-1] not in ('', '..'):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # A trailing '<segment>/..' collapses to a trailing '/'.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
176
def urldefrag(url):
    """Split the fragment identifier off a URL.

    Return a 2-tuple (stripped, fragment): stripped is the URL
    re-assembled by urlunparse() with an empty fragment, and fragment
    is the fragment component reported by urlparse() (the empty string
    when there is none).
    """
    parts = urlparse(url)
    fragment = parts[5]
    stripped = urlunparse(parts[:5] + ('',))
    return stripped, fragment
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000187
188
# Default input for test().  Blank lines are skipped.  The first
# non-blank line is the base URL; every later line has the form
#     <relative URL> = <URL:expected absolute URL>
# and test() prints an EXPECTED notice when urljoin() against the base
# produces something else.
test_input = """
 http://a/b/c/d

 g:h = <URL:g:h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 g = <URL:http://a/b/c/g>
 ./g = <URL:http://a/b/c/g>
 g/ = <URL:http://a/b/c/g/>
 /g = <URL:http://a/g>
 //g = <URL:http://g>
 ?y = <URL:http://a/b/c/d?y>
 g?y = <URL:http://a/b/c/g?y>
 g?y/./x = <URL:http://a/b/c/g?y/./x>
 . = <URL:http://a/b/c/>
 ./ = <URL:http://a/b/c/>
 .. = <URL:http://a/b/>
 ../ = <URL:http://a/b/>
 ../g = <URL:http://a/b/g>
 ../.. = <URL:http://a/>
 ../../g = <URL:http://a/g>
 ../../../g = <URL:http://a/../g>
 ./../g = <URL:http://a/b/g>
 ./g/. = <URL:http://a/b/c/g/>
 /./g = <URL:http://a/./g>
 g/./h = <URL:http://a/b/c/g/h>
 g/../h = <URL:http://a/b/c/h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 http:?y = <URL:http://a/b/c/d?y>
 http:g?y = <URL:http://a/b/c/g?y>
 http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
# XXX The result for //g is actually http://g/ (urlunparse() puts a '/'
# after a netloc that has no path), so that line is reported as a
# mismatch; is this a problem?
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000223
def test():
    """Run the urlparse()/urljoin() self-test and print the results.

    Input is read from the file named by the first command line
    argument ('-' means standard input), or from test_input when no
    argument is given.  Blank lines are skipped.  The first URL read
    becomes the base; every URL is parsed and joined against the base,
    and both results are printed.  When a line has the form
    '<url> = <URL:expected>' and the joined result differs from the
    expected text, an EXPECTED notice is printed as well.
    """
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        import StringIO
        fp = StringIO.StringIO(test_input)
    try:
        while 1:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            # Single-argument print(...) produces the same output as
            # the equivalent print statement.
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                # The first URL seen serves as base for all later ones.
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        # Release the input we opened ourselves; never close stdin.
        if fp is not sys.stdin:
            fp.close()


if __name__ == '__main__':
    test()