blob: af111f2e3791dd433cc95b368c711d6bff535a71 [file] [log] [blame]
"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +00006
7# Standard/builtin Python modules
8import string
Guido van Rossum4f136691999-05-03 18:16:23 +00009from string import joinfields, splitfields, rfind
Guido van Rossum23cb2a81994-09-12 10:36:35 +000010
# Scheme classification tables.  The empty string stands for "no scheme",
# so listing '' makes the rule apply by default.

# Schemes for which urljoin() resolves a relative reference against a base.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
    'https', 'shttp',
    'prospero', '',
]

# Schemes for which a leading '//' introduces a network location.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
    'file',
    'https', 'shttp', 'snews',
    'prospero', '',
]

# Schemes without a hierarchical path.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
    'snews',
]

# Schemes for which urlparse() splits off ';params'.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http',
    'https', 'shttp',
    '',
]

# Schemes for which urlparse() splits off '?query'.
uses_query = [
    'http', 'wais',
    'https', 'shttp',
    'gopher',
    '',
]

# Schemes for which urlparse() splits off '#fragment'.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
    'https', 'shttp', 'snews',
    'file', 'prospero', '',
]
32
# Characters valid in scheme names.  RFC 1808 limits a scheme to ASCII
# letters, digits, '+', '-' and '.', so the letters are spelled out here
# rather than taken from string.letters, whose value depends on the locale.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz' +
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
                string.digits + '+-.')

# urlparse() memoizes its results in _parse_cache.  Once the cache holds
# MAX_CACHE_SIZE entries it is emptied before the next result is stored.
MAX_CACHE_SIZE = 20
_parse_cache = {}
38
def clear_cache():
    """Forget every result memoized by urlparse().

    The module-level _parse_cache name is rebound to a new, empty
    dictionary; the previous dictionary object is left untouched.
    """
    global _parse_cache
    _parse_cache = {}
43
44
def urlparse(url, scheme = '', allow_fragments = 1):
    """Split a URL laid out as
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    into the 6-tuple (scheme, netloc, path, params, query, fragment).

    scheme is the value reported when the URL does not start with a
    scheme of its own.  When allow_fragments is false, a '#...' suffix
    is not split off.  The components are returned as plain strings:
    netloc is not broken up further and % escapes are not expanded.
    Results are memoized in _parse_cache.
    """
    cache_key = (url, scheme, allow_fragments)
    hit = _parse_cache.get(cache_key)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # Avoid runaway growth: start over with an empty cache.
        clear_cache()

    netloc = params = query = fragment = ''
    rest = url
    is_http = 0
    colon = string.find(url, ':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':
            # Common case: no need to validate or lower-case the scheme.
            is_http = 1
            scheme, rest = prefix, url[colon+1:]
        else:
            # The text before the ':' is a scheme only if every character
            # is a legal scheme character.
            for ch in prefix:
                if ch not in scheme_chars:
                    break
            else:
                scheme, rest = string.lower(prefix), url[colon+1:]

    # Decide which components this scheme may carry.  The 'http' fast
    # path takes all of them without consulting the classification lists.
    if is_http:
        has_netloc = has_query = has_params = 1
        has_fragment = allow_fragments
    else:
        has_netloc = scheme in uses_netloc
        has_fragment = allow_fragments and scheme in uses_fragment
        has_query = scheme in uses_query
        has_params = scheme in uses_params

    if has_netloc and rest[:2] == '//':
        # The netloc runs up to the next '/', or to the end of the URL.
        end = string.find(rest, '/', 2)
        if end < 0:
            end = len(rest)
        netloc, rest = rest[2:end], rest[end:]
    if has_fragment:
        pos = string.rfind(rest, '#')
        if pos >= 0:
            rest, fragment = rest[:pos], rest[pos+1:]
    if has_query:
        pos = string.find(rest, '?')
        if pos >= 0:
            rest, query = rest[:pos], rest[pos+1:]
    if has_params:
        pos = string.find(rest, ';')
        if pos >= 0:
            rest, params = rest[:pos], rest[pos+1:]

    result = (scheme, netloc, rest, params, query, fragment)
    _parse_cache[cache_key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000112
def urlunparse(parts):
    """Put a parsed URL back together again.

    parts is the 6-item sequence (scheme, netloc, path, params, query,
    fragment), as returned by urlparse().  The result may be a slightly
    different, but equivalent URL, if the URL that was parsed originally
    had redundant delimiters, e.g. a ? with an empty query (the draft
    states that these are equivalent).
    """
    scheme, netloc, url, params, query, fragment = parts
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        # A non-empty relative path needs a '/' to separate it from the
        # netloc.  An empty path stays empty, so that 'http://g' is not
        # turned into 'http://g/'.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
130
def urljoin(base, url, allow_fragments = 1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If base is empty, url is returned unchanged.  url is re-assembled
    without inheriting a path from base when its scheme differs from the
    base's scheme, when the scheme is not listed in uses_relative, when
    (for schemes in uses_netloc) it has a netloc of its own, or when its
    path starts with '/'.  Otherwise the missing components are taken
    from base and '.' and '..' path segments are resolved following
    RFC 1808, section 4.  allow_fragments is passed on to urlparse().
    """
    if not base:
        return url
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # RFC 1808, section 4, step 5: an empty path inherits the base
        # path.  The base params are inherited only if the reference has
        # none, and the base query only if the reference has neither
        # params nor a query of its own.
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Replace the last segment of the base path by the relative path.
    i = rfind(bpath, '/')
    if i >= 0:
        path = bpath[:i] + '/' + path
    segments = splitfields(path, '/')
    # A trailing '.' refers to the directory itself: keep the final '/'.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly drop the leftmost "<segment>/.." pair.  A '..' that
    # follows the empty root segment or another '..' has nothing to
    # cancel and is kept (RFC 1808, section 5.2).
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if segments[i] == '..' and segments[i-1] not in ('', '..'):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # A '..' in last position also removes the segment before it, but
    # leaves a trailing '/'.
    if len(segments) == 2 and segments[1] == '..' and segments[0] == '':
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, joinfields(segments, '/'),
                       params, query, fragment))
178
def urldefrag(url):
    """Split the fragment off a URL.

    Returns the pair (URL without its fragment, fragment).  The
    fragment is the empty string if the URL did not contain one.
    """
    parts = urlparse(url)
    # Re-assemble the first five components with an empty fragment.
    stripped = urlunparse(parts[:5] + ('',))
    return stripped, parts[5]
189
190
# Default input for test().  The first non-blank line is the base URL.
# Each following line reads "<relative URL> = <URL:expected>", where
# expected is the result test() wants from urljoin(base, relative URL).
# Lines are split on whitespace, so the column layout is not significant.
test_input = """
 http://a/b/c/d

 g:h = <URL:g:h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 g = <URL:http://a/b/c/g>
 ./g = <URL:http://a/b/c/g>
 g/ = <URL:http://a/b/c/g/>
 /g = <URL:http://a/g>
 //g = <URL:http://g>
 ?y = <URL:http://a/b/c/d?y>
 g?y = <URL:http://a/b/c/g?y>
 g?y/./x = <URL:http://a/b/c/g?y/./x>
 . = <URL:http://a/b/c/>
 ./ = <URL:http://a/b/c/>
 .. = <URL:http://a/b/>
 ../ = <URL:http://a/b/>
 ../g = <URL:http://a/b/g>
 ../.. = <URL:http://a/>
 ../../g = <URL:http://a/g>
 ../../../g = <URL:http://a/../g>
 ./../g = <URL:http://a/b/g>
 ./g/. = <URL:http://a/b/c/g/>
 /./g = <URL:http://a/./g>
 g/./h = <URL:http://a/b/c/g/h>
 g/../h = <URL:http://a/b/c/h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 http:?y = <URL:http://a/b/c/d?y>
 http:g?y = <URL:http://a/b/c/g?y>
 http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
# XXX The expected result for //g is http://g.  A result of http://g/
# (a '/' added after the netloc although the path is empty) is reported
# as a mismatch by test().
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000225
def test():
    """Run urlparse() and urljoin() over a list of URLs and print the results.

    The URLs are read from the file named by the first command line
    argument ('-' means standard input), or from test_input when no
    argument is given.  The first URL read becomes the base against
    which all following URLs are joined.  A line of the form
    "<url> = <expected>" is additionally checked: if the joined URL,
    wrapped as <URL:...>, differs from <expected>, an EXPECTED line is
    printed.
    """
    import sys
    base = ''
    opened = None       # file object this function must close, if any
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        import StringIO
        fp = StringIO.StringIO(test_input)
    try:
        while 1:
            line = fp.readline()
            if not line:
                break
            words = string.split(line)
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                # urljoin() returns url unchanged while base is empty, so
                # the first URL read becomes the base for the rest.
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        # Close only a file opened here; sys.stdin is left alone.
        if opened is not None:
            opened.close()

if __name__ == '__main__':
    test()