blob: dfea52d19caad14c54b87d40a957ad11b3be4d4b [file] [log] [blame]
# Parse (absolute and relative) URLs.  See RFC 1808: "Relative Uniform
# Resource Locators", by R. Fielding, UC Irvine, June 1995.

# Standard/builtin Python modules
5import string
Guido van Rossum4f136691999-05-03 18:16:23 +00006from string import joinfields, splitfields, rfind
Guido van Rossum23cb2a81994-09-12 10:36:35 +00007
# A classification of schemes ('' means apply by default, i.e. the entry
# that matches when a URL carries no scheme of its own).  The functions
# in this file only ever test these lists for membership.

# Schemes for which urljoin() resolves a reference against the base.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
    'https', 'shttp',
    'prospero', '',
]

# Schemes for which a leading '//' introduces a network location.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
    'file',
    'https', 'shttp', 'snews',
    'prospero', '',
]

# Schemes without a hierarchical path.  Not consulted by the functions
# visible in this file.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
    'snews',
]

# Schemes for which ';' separates the path from its parameters.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http',
    'https', 'shttp',
    '',
]

# Schemes for which '?' introduces a query.
uses_query = [
    'http', 'wais',
    'https', 'shttp',
    'gopher',
    '',
]

# Schemes for which '#' introduces a fragment identifier.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
    'https', 'shttp', 'snews',
    'file', 'prospero', '',
]

# Characters valid in scheme names.  They are spelled out literally
# instead of being derived from string.letters: that constant changes
# with the locale (and does not exist on Python 3), whereas RFC 1808
# restricts a scheme to ASCII letters, digits, '+', '-' and '.'.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# Largest number of results urlparse() memoizes.  When the cache has
# reached this size it is thrown away as a whole before the next result
# is stored; individual entries are never evicted.
MAX_CACHE_SIZE = 20

# Memo for urlparse(): maps (url, scheme, allow_fragments) to the
# 6-tuple that was returned for those arguments.
_parse_cache = {}

def clear_cache():
    """Forget every memoized urlparse() result.

    The module-level cache name is rebound to a brand-new dictionary;
    the dictionary that was in use before is not modified.
    """
    global _parse_cache
    _parse_cache = {}


# Parse a URL into 6 components:
# <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
# Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
# Note that we don't break the components up in smaller bits
# (e.g. netloc is a single string) and we don't expand % escapes.
def urlparse(url, scheme = '', allow_fragments = 1):
    """Split url into (scheme, netloc, path, params, query, fragment).

    url -- the URL string to take apart.
    scheme -- scheme to assume when url does not start with a valid one.
    allow_fragments -- when false, '#' is not treated as a delimiter and
        the returned fragment is always ''.

    Components that are absent come back as ''.  Which delimiters are
    honoured depends on the scheme (see the uses_* lists).  Results are
    memoized in _parse_cache, keyed on the three arguments.

    Following the parsing order of RFC 1808 section 2.4, the fragment is
    cut off at the left-most '#' before the network location is looked
    for, so a '#' can never end up inside netloc or path when fragments
    are enabled for the scheme.
    """
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key, None)
    if cached is not None:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = params = query = fragment = ''
    # Set for the literal, lower-case prefix 'http': the common case
    # skips character validation and the scheme-table lookups below.
    is_http = 0
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':
            is_http = 1
            scheme, url = prefix, url[colon+1:]
        else:
            for c in prefix:
                if c not in scheme_chars:
                    # Not a scheme after all; keep the default scheme and
                    # leave the colon in the remaining text.
                    break
            else:
                scheme, url = prefix.lower(), url[colon+1:]
    if allow_fragments and (is_http or scheme in uses_fragment):
        cut = url.find('#')
        if cut >= 0:
            url, fragment = url[:cut], url[cut+1:]
    if (is_http or scheme in uses_netloc) and url[:2] == '//':
        # The network location runs up to the next '/', or to the end.
        cut = url.find('/', 2)
        if cut < 0:
            cut = len(url)
        netloc, url = url[2:cut], url[cut:]
    if is_http or scheme in uses_query:
        cut = url.find('?')
        if cut >= 0:
            url, query = url[:cut], url[cut+1:]
    if is_http or scheme in uses_params:
        cut = url.find(';')
        if cut >= 0:
            url, params = url[:cut], url[cut+1:]
    result = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = result
    return result

# Put a parsed URL back together again.  This may result in a slightly
# different, but equivalent URL, if the URL that was parsed originally
# had redundant delimiters, e.g. a ? with an empty query (the draft
# states that these are equivalent).
def urlunparse(components):
    """Combine a 6-item sequence of URL components into one URL string.

    components -- (scheme, netloc, path, params, query, fragment), in
        the layout urlparse() returns.  Any sequence of six items is
        accepted.

    Empty components are left out together with their delimiter.  When a
    network location is written, a non-empty path is made to start with
    '/'; an empty path stays empty, so ('http', 'g', '', '', '', '')
    yields 'http://g' rather than 'http://g/'.
    """
    scheme, netloc, url, params, query, fragment = components
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url

# Join a base URL and a possibly relative URL to form an absolute
# interpretation of the latter.
def urljoin(base, url, allow_fragments = 1):
    """Resolve url against base and return the resulting URL string.

    base -- the URL that relative references are interpreted against.
    url -- the reference to resolve; it may already be absolute.
    allow_fragments -- handed on to urlparse() for both arguments.

    An empty base returns url unchanged and an empty url returns base
    unchanged.  Otherwise the steps of RFC 1808 section 4 are followed:
    a reference with a different scheme, or with a scheme that does not
    support relative forms, is returned as parsed; a reference with its
    own network location or an absolute path only inherits what comes
    before it; an empty path inherits the base path (and, if absent,
    the base parameters and then the base query); any other path is
    merged with the base path and '.' and '..' segments are resolved.
    """
    if not base:
        return url
    if not url:
        # An empty reference stands for the complete base URL.
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Inherit the base path.  Parameters are inherited only when the
        # reference has none, and the query only when the reference has
        # neither parameters nor a query of its own.
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Replace the last segment of the base path with the relative path.
    slash = bpath.rfind('/')
    if slash >= 0:
        path = bpath[:slash] + '/' + path
    segments = path.split('/')
    # A trailing '.' names the directory itself: keep the final slash.
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly drop the left-most "<segment>/.." pair that is followed
    # by further segments.  A pair is only dropped when <segment> is a
    # real name: an empty segment or another '..' must stay in place.
    while 1:
        for i in range(1, len(segments) - 1):
            if segments[i] == '..' and segments[i-1] not in ('', '..'):
                del segments[i-1:i+1]
                break
        else:
            break
    if len(segments) == 2 and segments[1] == '..' and segments[0] == '':
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))

def urldefrag(url):
    """Split a URL into its fragment-free form and its fragment.

    Returns a 2-tuple (url_without_fragment, fragment).  The second
    item is the empty string when the URL has no fragment.  The first
    item is rebuilt from the parsed components, so it is not
    necessarily character-for-character the text that was passed in.
    """
    parts = urlparse(url)
    # Reassemble the first five components with an empty fragment slot.
    stripped = urlunparse(parts[:5] + ('',))
    return stripped, parts[5]


# Default input for test().  Lines are split on whitespace; blank lines
# are skipped.  The first URL is joined against an empty base and thereby
# becomes the base for all later lines.  A line of the form
# "<reference> = <URL:expected>" also states the expected join result.
test_input = """
 http://a/b/c/d

 g:h = <URL:g:h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 g = <URL:http://a/b/c/g>
 ./g = <URL:http://a/b/c/g>
 g/ = <URL:http://a/b/c/g/>
 /g = <URL:http://a/g>
 //g = <URL:http://g>
 ?y = <URL:http://a/b/c/d?y>
 g?y = <URL:http://a/b/c/g?y>
 g?y/./x = <URL:http://a/b/c/g?y/./x>
 . = <URL:http://a/b/c/>
 ./ = <URL:http://a/b/c/>
 .. = <URL:http://a/b/>
 ../ = <URL:http://a/b/>
 ../g = <URL:http://a/b/g>
 ../.. = <URL:http://a/>
 ../../g = <URL:http://a/g>
 ../../../g = <URL:http://a/../g>
 ./../g = <URL:http://a/b/g>
 ./g/. = <URL:http://a/b/c/g/>
 /./g = <URL:http://a/./g>
 g/./h = <URL:http://a/b/c/g/h>
 g/../h = <URL:http://a/b/c/h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 http:?y = <URL:http://a/b/c/d?y>
 http:g?y = <URL:http://a/b/c/g?y>
 http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
# XXX For //g this module has historically produced http://g/ (with a
# trailing slash) where the table above expects http://g; is this a
# problem?

def test():
    """Parse and join a list of URLs, printing the results.

    Input comes from the file named by the first command-line argument
    ('-' selects standard input) or, without arguments, from test_input.
    The first URL read becomes the base for the ones that follow.  For
    every URL the parsed tuple and the joined result are printed; when a
    line has the form "<reference> = <expected>" and the joined result
    differs from <expected>, an EXPECTED line is printed as well.
    """
    import sys
    base = ''
    opened = None       # file this function opened and so has to close
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        try:
            from StringIO import StringIO       # Python 2
        except ImportError:
            from io import StringIO             # Python 3
        fp = StringIO(test_input)
    try:
        while 1:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % (words[2],))
    finally:
        if opened is not None:
            opened.close()

if __name__ == '__main__':
    test()