blob: 148633e954e8e37891a65caf667c9071e6348674 [file] [log] [blame]
# Parse (absolute and relative) URLs.  See RFC 1808: "Relative Uniform
# Resource Locators", by R. Fielding, UC Irvine, June 1995.
Guido van Rossum23cb2a81994-09-12 10:36:35 +00003
4# Standard/builtin Python modules
5import string
Guido van Rossum3fd32ec1996-05-28 23:54:24 +00006from string import joinfields, splitfields, find, rfind
Guido van Rossum23cb2a81994-09-12 10:36:35 +00007
# A classification of schemes ('' means apply by default)

# Schemes for which urljoin() resolves a relative reference against the
# base URL.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',
                 'prospero', '']
# Schemes for which urlparse() splits off a leading //<netloc> part;
# urljoin() also consults this list when inheriting the base netloc.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'https', 'shttp', 'snews',
               'prospero', '']
# NOTE(review): not consulted by urlparse/urlunparse/urljoin/urldefrag
# here; presumably kept for external callers -- confirm before removing.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews',
                    ]
# Schemes for which urlparse() splits off ;<params>.
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp',
               '']
# Schemes for which urlparse() splits off ?<query>.
uses_query = ['http', 'wais',
              'https', 'shttp',
              'gopher',
              '']
# Schemes for which urlparse() splits off #<fragment> (when fragments
# are allowed by the caller).
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names.  Spelled out literally instead of
# being built from string.letters, whose value follows the current
# locale; scheme names are restricted to ASCII letters, digits and
# "+", "-", ".".
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlparse() memoizes its results in _parse_cache; once the cache holds
# MAX_CACHE_SIZE entries it is emptied wholesale before the next insert.
MAX_CACHE_SIZE = 20
_parse_cache = {}
34
def clear_cache():
    """Forget every memoized urlparse() result.

    The module-level name _parse_cache is rebound to a brand-new empty
    dictionary; the dictionary it referred to before is not modified.
    """
    global _parse_cache
    _parse_cache = dict()
39
40
# Parse a URL into 6 components:
# <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
# Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
# Note that we don't break the components up in smaller bits
# (e.g. netloc is a single string) and we don't expand % escapes.
def urlparse(url, scheme = '', allow_fragments = 1):
    """Split url into (scheme, netloc, path, params, query, fragment).

    scheme is the default used when url does not begin with a valid
    "<scheme>:" prefix; a scheme found in url is returned lower-cased.
    When allow_fragments is false, '#' is not treated as a delimiter
    and the returned fragment is ''.  Which delimiters are honoured
    for a scheme is governed by uses_netloc, uses_fragment, uses_query
    and uses_params.

    Results are memoized in _parse_cache, keyed on all three arguments.
    """
    cache_key = (url, scheme, allow_fragments)
    hit = _parse_cache.get(cache_key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # Avoid runaway growth: throw the whole cache away.
        clear_cache()

    netloc = params = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':
            # Optimize the common case: split on every delimiter without
            # consulting the scheme classification lists.
            scheme = prefix.lower()
            url = url[colon+1:]
            if url[:2] == '//':
                slash = url.find('/', 2)
                if slash < 0:
                    slash = len(url)
                netloc, url = url[2:slash], url[slash:]
            if allow_fragments:
                cut = url.rfind('#')
                if cut >= 0:
                    url, fragment = url[:cut], url[cut+1:]
            cut = url.find('?')
            if cut >= 0:
                url, query = url[:cut], url[cut+1:]
            cut = url.find(';')
            if cut >= 0:
                url, params = url[:cut], url[cut+1:]
            result = (scheme, netloc, url, params, query, fragment)
            _parse_cache[cache_key] = result
            return result
        # Accept the prefix as a scheme only if every character in it
        # is a legal scheme character; otherwise the ':' is part of the
        # rest of the URL and the default scheme stays in effect.
        legal = 1
        for ch in prefix:
            if ch not in scheme_chars:
                legal = 0
                break
        if legal:
            scheme = prefix.lower()
            url = url[colon+1:]

    if scheme in uses_netloc and url[:2] == '//':
        # The netloc runs from after the '//' up to the next '/'.
        slash = url.find('/', 2)
        if slash < 0:
            slash = len(url)
        netloc, url = url[2:slash], url[slash:]
    if allow_fragments and scheme in uses_fragment:
        # The fragment starts after the right-most '#'.
        cut = url.rfind('#')
        if cut >= 0:
            url, fragment = url[:cut], url[cut+1:]
    if scheme in uses_query:
        cut = url.find('?')
        if cut >= 0:
            url, query = url[:cut], url[cut+1:]
    if scheme in uses_params:
        cut = url.find(';')
        if cut >= 0:
            url, params = url[:cut], url[cut+1:]

    result = (scheme, netloc, url, params, query, fragment)
    _parse_cache[cache_key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000108
# Put a parsed URL back together again.  This may result in a slightly
# different, but equivalent URL, if the URL that was parsed originally
# had redundant delimiters, e.g. a ? with an empty query (the draft
# states that these are equivalent).
def urlunparse(components):
    """Combine parsed URL components back into a URL string.

    components is a sequence of exactly six strings:
    (scheme, netloc, path, params, query, fragment).  Unpacking a
    sequence of any other length raises ValueError.

    Empty components are omitted together with their delimiter, so the
    result may differ from the originally parsed text when that text
    contained redundant delimiters.
    """
    # Unpack inside the body rather than in the parameter list, so the
    # definition does not rely on tuple-parameter syntax.
    scheme, netloc, url, params, query, fragment = components
    if netloc:
        # A non-empty relative path needs a '/' to separate it from the
        # netloc.  An empty path is left empty, so that a bare
        # //<netloc> reference is rendered as e.g. http://g and not
        # http://g/.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + netloc + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
126
# Join a base URL and a possibly relative URL to form an absolute
# interpretation of the latter.
def urljoin(base, url, allow_fragments = 1):
    """Resolve url relative to base and return the resulting URL string.

    base            -- the base URL; when empty, url is returned as is.
    url             -- an absolute or relative URL; when empty, base is
                       returned as is.
    allow_fragments -- passed through to urlparse() for both URLs.

    Resolution follows the steps of RFC 1808 section 4: a reference
    with its own scheme (or with a scheme not listed in uses_relative)
    is returned essentially unchanged; otherwise the netloc, path,
    params and query are inherited from the base as far as the
    reference leaves them empty, and "." / ".." path segments are
    collapsed.
    """
    if not base:
        return url
    if not url:
        # An entirely empty reference stands for the base URL itself,
        # fragment included.
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)
    # XXX Unofficial hack: default netloc to bnetloc even if
    # schemes differ
    if scheme != bscheme and not netloc and \
       scheme in uses_relative and bscheme in uses_relative and \
       scheme in uses_netloc and bscheme in uses_netloc:
        netloc = bnetloc
        # Strip the port number: cut at the first ':' that follows the
        # '@' (or at the first ':' at all when there is no '@').
        at = netloc.find('@')
        if at < 0:
            at = 0
        colon = netloc.find(':', at)
        if colon >= 0:
            netloc = netloc[:colon]
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own host: nothing to inherit.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: inherit the base path.  The base params are
        # inherited only if the reference has none, and the base query
        # only if the reference has neither params nor query.
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Relative path: replace the last segment of the base path.
    cut = bpath.rfind('/')
    if cut >= 0:
        path = bpath[:cut] + '/' + path
    segments = path.split('/')
    # A trailing "." is dropped but its slash is kept; every other "."
    # segment is simply removed.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly remove the left-most "<segment>/.." pair from the
    # interior of the path.  <segment> must be a real segment: neither
    # empty (the root) nor itself "..", so surplus ".." segments are
    # preserved rather than cancelling each other.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if segments[i] == '..' and segments[i-1] not in ('', '..'):
                del segments[i-1:i+1]
                break
            i = i + 1
        else:
            break
    # Handle a ".." left as the final segment.
    if len(segments) == 2 and segments[1] == '..' and segments[0] == '':
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
186
def urldefrag(url):
    """Split the fragment off a URL.

    Returns the pair (url_without_fragment, fragment).  The first item
    is rebuilt with urlunparse() from the components urlparse() found,
    so it can differ textually from the input; the second item is the
    empty string when url carries no fragment.
    """
    parts = urlparse(url)
    fragment = parts[5]
    stripped = urlunparse(parts[:5] + ('',))
    return stripped, fragment
197
198
# Default input for test(): the first non-blank line is the base URL;
# every following line is "<relative URL> = <expected absolute URL>".
# Lines are split on whitespace, so the column alignment is cosmetic.
test_input = """
      http://a/b/c/d

      g:h           = <URL:g:h>
      http:g        = <URL:http://a/b/c/g>
      http:         = <URL:http://a/b/c/d>
      g             = <URL:http://a/b/c/g>
      ./g           = <URL:http://a/b/c/g>
      g/            = <URL:http://a/b/c/g/>
      /g            = <URL:http://a/g>
      //g           = <URL:http://g>
      ?y            = <URL:http://a/b/c/d?y>
      g?y           = <URL:http://a/b/c/g?y>
      g?y/./x       = <URL:http://a/b/c/g?y/./x>
      .             = <URL:http://a/b/c/>
      ./            = <URL:http://a/b/c/>
      ..            = <URL:http://a/b/>
      ../           = <URL:http://a/b/>
      ../g          = <URL:http://a/b/g>
      ../..         = <URL:http://a/>
      ../../g       = <URL:http://a/g>
      ../../../g    = <URL:http://a/../g>
      ./../g        = <URL:http://a/b/g>
      ./g/.         = <URL:http://a/b/c/g/>
      /./g          = <URL:http://a/./g>
      g/./h         = <URL:http://a/b/c/g/h>
      g/../h        = <URL:http://a/b/c/h>
      http:g        = <URL:http://a/b/c/g>
      http:         = <URL:http://a/b/c/d>
      http:?y       = <URL:http://a/b/c/d?y>
      http:g?y      = <URL:http://a/b/c/g?y>
      http:g?y/./x  = <URL:http://a/b/c/g?y/./x>
"""
# XXX If urlunparse() adds a '/' after the netloc for an empty path, the
# result for //g is http://g/ rather than http://g; is this a problem?
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000233
def test():
    """Run the URL joining self-test and print the results.

    Input comes from the file named by the first command-line argument
    ('-' means standard input) or, without arguments, from test_input.
    The first word of the first non-blank line becomes the base URL;
    the first word of every line is parsed and joined against the base,
    and both results are printed.  For lines of the form
    "<url> = <expected>", a mismatch between the joined URL and the
    expected text is reported with an EXPECTED line.
    """
    import sys
    base = ''
    opened_here = 0
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
            opened_here = 1
    else:
        import StringIO
        fp = StringIO.StringIO(test_input)
    try:
        # Read line by line so that interactive use via '-' responds
        # after each line.
        while 1:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                # urljoin() returns url unchanged for an empty base, so
                # the first URL seen becomes the base for the rest.
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        # Only close what this function opened; never close stdin.
        if opened_here:
            fp.close()

if __name__ == '__main__':
    test()