blob: af41a7af096221a3d713b572b8820aadccf660d9 [file] [log] [blame]
# Parse (absolute and relative) URLs.  See RFC 1808: "Relative Uniform
# Resource Locators", by R. Fielding, UC Irvine, June 1995.
Guido van Rossum23cb2a81994-09-12 10:36:35 +00003
4# Standard/builtin Python modules
5import string
Guido van Rossum3fd32ec1996-05-28 23:54:24 +00006from string import joinfields, splitfields, find, rfind
Guido van Rossum23cb2a81994-09-12 10:36:35 +00007
# Scheme classification tables.  The empty string '' stands for "no
# scheme", i.e. the behaviour that is applied by default.

# Schemes for which urljoin() resolves a relative URL against the base.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
    'https', 'shttp',
    'prospero', '',
]

# Schemes for which urlparse() splits off a leading '//<netloc>'.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
    'file',
    'https', 'shttp', 'snews',
    'prospero', '',
]

# Schemes without a hierarchical path.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
    'snews',
]

# Schemes for which urlparse() splits off ';<params>'.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http',
    'https', 'shttp',
    '',
]

# Schemes for which urlparse() splits off '?<query>'.
uses_query = [
    'http', 'wais',
    'https', 'shttp',
    'gopher',
    '',
]

# Schemes for which urlparse() splits off '#<fragment>'.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
    'https', 'shttp', 'snews',
    'file', 'prospero', '',
]
29
# Characters valid in scheme names.  Spelled out explicitly instead of
# being built from string.letters: that constant is locale-dependent, so
# under a non-C locale it could admit non-ASCII letters into a scheme.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
32
# Memoization of urlparse() results.  MAX_CACHE_SIZE is the number of
# entries at which urlparse() throws the whole cache away (there is no
# per-entry eviction).
MAX_CACHE_SIZE = 20
_parse_cache = {}


def clear_cache():
    """Forget every memoized parse result.

    The module-level cache name is rebound to a brand-new empty
    dictionary; the previous dictionary object is not modified.
    """
    global _parse_cache
    _parse_cache = {}
40
41
def urlparse(url, scheme='', allow_fragments=1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Arguments:
    url -- the URL string to parse.
    scheme -- scheme to assume when url does not carry one of its own.
    allow_fragments -- when false, '#' is not treated as a fragment
        delimiter and the fragment component is always ''.

    A leading '<name>:' is accepted as the scheme only if every
    character of <name> is in scheme_chars; it is returned lower-cased.
    Following RFC 1808 section 2.4, the fragment is everything after
    the left-most '#', and it is removed before the network location is
    looked for, so a '#' can never end up inside netloc.

    Results are memoized in _parse_cache, keyed on all three
    arguments; the cache is cleared when it reaches MAX_CACHE_SIZE.
    """
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key, None)
    if cached is not None:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:     # avoid runaway growth
        clear_cache()
    find = string.find
    netloc = params = query = fragment = ''
    i = find(url, ':')
    if i > 0:
        if url[:i] == 'http':           # optimize the common case
            # This fast path applies every splitting step
            # unconditionally, without consulting the uses_* tables.
            scheme = 'http'
            url = url[i+1:]
            if allow_fragments:
                i = find(url, '#')
                if i >= 0:
                    url, fragment = url[:i], url[i+1:]
            if url[:2] == '//':
                i = find(url, '/', 2)
                if i < 0:
                    i = len(url)
                netloc, url = url[2:i], url[i:]
            i = find(url, '?')
            if i >= 0:
                url, query = url[:i], url[i+1:]
            i = find(url, ';')
            if i >= 0:
                url, params = url[:i], url[i+1:]
            result = scheme, netloc, url, params, query, fragment
            _parse_cache[key] = result
            return result
        # Accept the prefix as a scheme only if all of its characters
        # are legal; otherwise the caller's default scheme stays.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = string.lower(url[:i]), url[i+1:]
    # The fragment goes first so that its text cannot be mistaken for
    # part of the netloc, the query or the params.
    if allow_fragments and scheme in uses_fragment:
        i = find(url, '#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_netloc:
        if url[:2] == '//':
            i = find(url, '/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if scheme in uses_query:
        i = find(url, '?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = find(url, ';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    result = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000109
def urlunparse(components):
    """Put a parsed URL back together again.

    components is a sequence of exactly six strings:
    (scheme, netloc, url, params, query, fragment), as returned by
    urlparse().  The reassembled URL string is returned.

    This may result in a slightly different, but equivalent URL, if
    the URL that was parsed originally had redundant delimiters, e.g.
    a ? with an empty query (the draft states that these are
    equivalent).  Empty components are left out together with their
    delimiter.
    """
    # Unpacked here rather than in the parameter list: tuple parameters
    # are not accepted by newer Python versions.
    scheme, netloc, url, params, query, fragment = components
    if netloc:
        # A network location must be followed by an absolute path.
        if url[:1] != '/':
            url = '/' + url
        url = '//' + netloc + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
127
def urljoin(base, url, allow_fragments=1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    Arguments:
    base -- the base URL; if it is empty, url is returned unchanged.
    url -- the URL to interpret relative to base.
    allow_fragments -- passed on to urlparse() for both URLs.

    The resolution follows RFC 1808 section 4:
    - a url with its own (different) scheme or its own netloc is
      returned essentially as is;
    - an empty path inherits the base path, then the base params if it
      has none of its own, then the base query if it has neither
      params nor query of its own;
    - otherwise the path is merged with the directory part of the base
      path and '.' and '<segment>/..' are removed.  A '..' that has
      nothing left to cancel is kept in the result.
    """
    if not base:
        return url
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)
    # XXX Unofficial hack: default netloc to bnetloc even if
    # schemes differ
    if scheme != bscheme and not netloc and \
       scheme in uses_relative and bscheme in uses_relative and \
       scheme in uses_netloc and bscheme in uses_netloc:
        netloc = bnetloc
        # Strip the port number (the first ':' after any 'user@' part)
        i = string.find(netloc, '@')
        if i < 0:
            i = 0
        i = string.find(netloc, ':', i)
        if i >= 0:
            netloc = netloc[:i]
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # RFC 1808 step 5: the base params are inherited only when the
        # url has none, and the base query only when the url has
        # neither params nor query.
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Merge with the base path minus its last segment.
    i = string.rfind(bpath, '/')
    if i >= 0:
        path = bpath[:i] + '/' + path
    segments = string.split(path, '/')
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly drop the first '<segment>/..' pair until none is left.
    # The last element is never examined here; a trailing '..' is dealt
    # with below.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            # '' is the root or an empty segment and '..' is an
            # unresolved parent reference: neither may be cancelled.
            if segments[i] == '..' and segments[i-1] not in ('', '..'):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if len(segments) == 2 and segments[1] == '..' and segments[0] == '':
        # '/..' collapses to '/'.
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # A trailing '<segment>/..' leaves the parent directory.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, string.join(segments, '/'),
                       params, query, fragment))
187
def urldefrag(url):
    """Split a URL into its fragment-free form and its fragment.

    The URL is parsed with urlparse() and reassembled with
    urlunparse() with the fragment left out.  The result is the
    2-tuple (defragmented URL, fragment); the fragment is the empty
    string when urlparse() found none.
    """
    parsed = urlparse(url)
    fragment = parsed[5]
    without_fragment = urlunparse(parsed[:5] + ('',))
    return without_fragment, fragment
198
199
# Built-in data for test().  The first non-blank line is the base URL;
# every following line reads "<relative URL> = <URL:expected result>".
test_input = """
      http://a/b/c/d

      g:h          = <URL:g:h>
      http:g       = <URL:http://a/b/c/g>
      http:        = <URL:http://a/b/c/d>
      g            = <URL:http://a/b/c/g>
      ./g          = <URL:http://a/b/c/g>
      g/           = <URL:http://a/b/c/g/>
      /g           = <URL:http://a/g>
      //g          = <URL:http://g>
      ?y           = <URL:http://a/b/c/d?y>
      g?y          = <URL:http://a/b/c/g?y>
      g?y/./x      = <URL:http://a/b/c/g?y/./x>
      .            = <URL:http://a/b/c/>
      ./           = <URL:http://a/b/c/>
      ..           = <URL:http://a/b/>
      ../          = <URL:http://a/b/>
      ../g         = <URL:http://a/b/g>
      ../..        = <URL:http://a/>
      ../../g      = <URL:http://a/g>
      ../../../g   = <URL:http://a/../g>
      ./../g       = <URL:http://a/b/g>
      ./g/.        = <URL:http://a/b/c/g/>
      /./g         = <URL:http://a/./g>
      g/./h        = <URL:http://a/b/c/g/h>
      g/../h       = <URL:http://a/b/c/h>
      http:g       = <URL:http://a/b/c/g>
      http:        = <URL:http://a/b/c/d>
      http:?y      = <URL:http://a/b/c/d?y>
      http:g?y     = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
# XXX The result for //g is actually http://g/; is this a problem?
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000234
def test():
    """Run the join tests and print the results.

    The input comes from the file named by the first command line
    argument ('-' means standard input), or from test_input when no
    argument is given.  The first URL read becomes the base for all
    later ones.  For every non-blank line the parsed form of its first
    word and the joined URL are printed; if the line has the form
    "<url> = <expected>" and the joined URL differs from the expected
    one, an EXPECTED line is printed as well.
    """
    import sys
    base = ''
    opened = None               # file we opened ourselves, if any
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        import StringIO
        fp = StringIO.StringIO(test_input)
    try:
        while 1:
            line = fp.readline()
            if not line:
                break
            words = string.split(line)
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        # Only close what was opened here; sys.stdin is left alone.
        if opened is not None:
            opened.close()

if __name__ == '__main__':
    test()