blob: d927b7e777dcc404932fd6bc055420a4cda018d9 [file] [log] [blame]
"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +00006
# A classification of schemes ('' means apply by default, i.e. when the
# URL carries no scheme of its own).  These are deliberately plain lists
# so that the membership tests below keep working if entries are added.

# Schemes for which urljoin() resolves a relative URL against the base.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',
                 'prospero', 'rtsp', 'rtspu', '']

# Schemes whose URLs may carry a '//<netloc>' part.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'file',
               'https', 'shttp', 'snews',
               'prospero', 'rtsp', 'rtspu', '']

# Not consulted by the functions in this module.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews', 'sip',
                    ]

# Schemes whose URLs may carry ';<params>'.
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip',
               '']

# Schemes whose URLs may carry '?<query>'.
uses_query = ['http', 'wais',
              'https', 'shttp',
              'gopher', 'rtsp', 'rtspu', 'sip',
              '']

# Schemes whose URLs may carry '#<fragment>'.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']
28
# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlparse() memoizes its results in _parse_cache, keyed on its argument
# triple; once the cache holds MAX_CACHE_SIZE entries it is emptied
# wholesale before the next entry is stored.
MAX_CACHE_SIZE = 20
_parse_cache = {}
37
def clear_cache():
    """Clear the parse cache.

    The module-level cache name is rebound to a fresh, empty dictionary
    (the previous dictionary object itself is left untouched).
    """
    global _parse_cache
    _parse_cache = {}
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000042
43
def _split_netloc(url, delimiters):
    """Split '//<netloc><rest>' into the pair (netloc, rest).

    url must start with '//'.  The netloc ends at the first occurrence
    (after the leading '//') of any character in delimiters, or at the
    end of the string when none of them occurs.
    """
    end = len(url)
    for delim in delimiters:
        pos = url.find(delim, 2)
        if 0 <= pos < end:
            end = pos
    return url[2:end], url[end:]


def urlparse(url, scheme='', allow_fragments=1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    scheme is the scheme to assume when url does not carry one; a scheme
    found in url is returned in lower case.  When allow_fragments is
    false, '#' is not treated as a fragment delimiter.  A component is
    only split off when the scheme is listed in the matching uses_*
    list (a literal 'http' prefix always gets every component).

    Results are memoized per (url, scheme, allow_fragments).
    """
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key)
    if cached is not None:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = params = query = fragment = ''
    is_http = 0
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http':  # optimize the common case
            # 'http' uses every component, so skip the table lookups.
            is_http = 1
            scheme, url = 'http', url[i+1:]
        else:
            for c in url[:i]:
                if c not in scheme_chars:
                    # Not a scheme after all; keep the default scheme
                    # and treat the ':' as ordinary URL text.
                    break
            else:
                scheme, url = url[:i].lower(), url[i+1:]
    takes_fragment = allow_fragments and (is_http or scheme in uses_fragment)
    takes_query = is_http or scheme in uses_query
    takes_params = is_http or scheme in uses_params
    if (is_http or scheme in uses_netloc) and url[:2] == '//':
        # The netloc stops at the path, and also at a query or fragment
        # delimiter when the scheme uses one: 'http://host?q' must not
        # yield the netloc 'host?q'.
        delimiters = '/'
        if takes_query:
            delimiters += '?'
        if takes_fragment:
            delimiters += '#'
        netloc, url = _split_netloc(url, delimiters)
    if takes_fragment:
        # The fragment starts at the first '#' (RFC 1808, section 2.4.1).
        i = url.find('#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if takes_query:
        i = url.find('?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if takes_params:
        i = url.find(';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    # Whatever is left of url at this point is the path.
    result = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000110
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    data is a sequence of the six components
    (scheme, netloc, path, params, query, fragment), as returned by
    urlparse().  The combined URL string is returned.
    """
    # Unpack in the body rather than in the signature: tuple parameters
    # are not valid syntax in newer Python versions.
    scheme, netloc, url, params, query, fragment = data
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        # A non-empty path must be separated from the netloc by a '/';
        # an empty path is left empty so that '//g' does not grow a
        # trailing slash.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
128
def urljoin(base, url, allow_fragments=1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If base is empty, url is returned unchanged.  allow_fragments is
    passed on to urlparse() for both URLs.
    """
    if not base:
        return url
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        # A different scheme, or one without relative forms: url is
        # already absolute.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # RFC 1808, section 4, step 5: an empty path inherits the base
        # path; params are inherited only if the relative URL has none,
        # and the query only if it has neither params nor query.
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Replace the last segment of the base path by the relative path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        # A trailing '.' denotes the directory itself: keep the final '/'.
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    while 1:
        # Collapse one '<dir>/..' pair per pass, then rescan from the
        # start; the last segment is handled separately below.
        i = 1
        n = len(segments) - 1
        while i < n:
            # A '..' only cancels a real directory name, never the empty
            # root segment or another (uncancelled) '..'.
            if segments[i] == '..' and segments[i-1] not in ('', '..'):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if len(segments) == 2 and segments[1] == '..' and segments[0] == '':
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
174
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.

    The defragmented URL is rebuilt with urlunparse() from the parsed
    components, so it may differ from the input in redundant delimiters.
    """
    scheme, netloc, path, params, query, frag = urlparse(url)
    defrag = urlunparse((scheme, netloc, path, params, query, ''))
    return defrag, frag
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000185
186
# Input for test(): the first non-blank line is the base URL; every
# following line reads "<relative URL> = <expected result>".
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""
Guido van Rossumc08cc501998-12-21 18:24:09 +0000220# XXX The result for //g is actually http://g/; is this a problem?
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000221
def test():
    """Run the URL joining self-test and print the results.

    Lines are read from the file named by the first command line
    argument ('-' means standard input), or from test_input when no
    argument is given.  The first URL seen becomes the base for all
    later ones.  For each line the parsed components and the joined URL
    are printed; if the line has the form "<url> = <expected>" and the
    joined URL differs from the expectation, an EXPECTED line follows.
    """
    import sys
    base = ''
    opened = None  # file we opened ourselves and therefore must close
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        # The StringIO class lives in different modules depending on
        # the Python version.
        try:
            from StringIO import StringIO
        except ImportError:
            from io import StringIO
        fp = StringIO(test_input)
    try:
        while 1:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        if opened is not None:
            opened.close()
251
# Run the self-test when this file is executed as a script.
if __name__ == '__main__':
    test()