blob: fe2c8ded95e89badfc2b2c6c1d3f1c3fcf509708 [file] [log] [blame]
# Parse (absolute and relative) URLs.  See RFC 1808: "Relative Uniform
# Resource Locators", by R. Fielding, UC Irvine, June 1995.
Guido van Rossum23cb2a81994-09-12 10:36:35 +00003
4# Standard/builtin Python modules
5import string
Guido van Rossum3fd32ec1996-05-28 23:54:24 +00006from string import joinfields, splitfields, find, rfind
Guido van Rossum23cb2a81994-09-12 10:36:35 +00007
# Scheme classification tables.  Each list names the URL schemes for
# which one piece of URL syntax applies.  The empty string '' is the
# entry for "no scheme given", i.e. what is applied by default.

# Schemes for which urljoin() resolves a relative reference against
# the base URL.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
    'https', 'shttp',
    'prospero', '',
]

# Schemes whose URLs may carry a '//<netloc>' part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
    'https', 'shttp', 'snews',
    'prospero', '',
]

# Schemes without a hierarchical path.
# NOTE(review): not consulted by any code visible in this file.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
    'snews',
]

# Schemes for which urlparse() splits off ';<params>'.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http',
    'https', 'shttp',
    '',
]

# Schemes for which urlparse() splits off '?<query>'.
uses_query = [
    'http', 'wais',
    'https', 'shttp',
    'gopher',
    '',
]

# Schemes for which urlparse() splits off '#<fragment>'.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
    'https', 'shttp', 'snews',
    'file', 'prospero', '',
]
28
# Characters valid in scheme names.  Spelled out literally instead of
# being built from string.letters: that attribute changes with the
# locale (and is gone in Python 3), whereas scheme names are limited to
# ASCII letters, digits, '+', '-' and '.'.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
31
# Upper bound on the number of memoised urlparse() results; urlparse()
# empties the whole cache once it holds this many entries.
MAX_CACHE_SIZE = 20
# Memo table: (url, scheme, allow_fragments) -> parsed 6-tuple.
_parse_cache = dict()
34
def clear_cache():
    """Discard every memoised urlparse() result.

    The module-level cache name is rebound to a brand-new empty
    dictionary; the dictionary object it previously referred to is not
    modified.
    """
    global _parse_cache
    _parse_cache = dict()
39
40
# Parse a URL into 6 components:
# <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
# Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
# Note that we don't break the components up into smaller bits
# (e.g. netloc is a single string) and we don't expand % escapes.
def urlparse(url, scheme = '', allow_fragments = 1):
    """Split a URL into the 6-tuple
    (scheme, netloc, path, params, query, fragment).

    The general URL shape is
        <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Components are returned as plain strings: netloc is not broken up
    any further and % escapes are not expanded.  Missing components
    come back as ''.

    url             -- the URL string to split.
    scheme          -- scheme to assume when the URL does not start
                       with one.
    allow_fragments -- when false, a '#' is never treated as the start
                       of a fragment and the fragment is always ''.

    Results are memoised in the module-level parse cache, keyed on all
    three arguments; the cache is emptied once it reaches
    MAX_CACHE_SIZE entries.
    """
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = params = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                i = url.find('/', 2)
                if i < 0:
                    i = len(url)
                netloc, url = url[2:i], url[i:]
            # Each split below must take the tail *and* shorten url in
            # a single step: slicing the tail from an already
            # shortened url would always yield ''.
            if allow_fragments:
                i = url.rfind('#')
                if i >= 0:
                    url, fragment = url[:i], url[i+1:]
            i = url.find('?')
            if i >= 0:
                url, query = url[:i], url[i+1:]
            i = url.find(';')
            if i >= 0:
                url, params = url[:i], url[i+1:]
            result = scheme, netloc, url, params, query, fragment
            _parse_cache[key] = result
            return result
        # Only accept the text before ':' as a scheme when every
        # character of it is a legal scheme character.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if scheme in uses_netloc:
        if url[:2] == '//':
            i = url.find('/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if allow_fragments and scheme in uses_fragment:
        i = url.rfind('#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_query:
        i = url.find('?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = url.find(';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    result = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000108
# Put a parsed URL back together again.  This may result in a slightly
# different, but equivalent URL, if the URL that was parsed originally
# had redundant delimiters, e.g. a ? with an empty query (the draft
# states that these are equivalent).
def urlunparse(components):
    """Reassemble a URL string from its six parsed components.

    components -- a 6-item sequence
                  (scheme, netloc, path, params, query, fragment),
                  e.g. the tuple returned by urlparse().

    Returns the URL as a string.  Empty components are omitted together
    with their delimiter, so the result can differ from the originally
    parsed text when that text had redundant delimiters (such as a '?'
    with an empty query).

    The argument is unpacked inside the body rather than in the
    signature, because tuple parameters are not valid syntax on
    Python 3; callers still pass a single 6-item sequence.
    """
    scheme, netloc, url, params, query, fragment = components
    if netloc:
        # A '/' must separate the netloc from whatever follows it.  It
        # is not added when nothing follows, so that '//g' reassembles
        # to 'http://g' as in the RFC 1808 examples instead of growing
        # a trailing slash.
        if url[:1] != '/' and (url or params or query or fragment):
            url = '/' + url
        url = '//' + netloc + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
126
# Join a base URL and a possibly relative URL to form an absolute
# interpretation of the latter.
def urljoin(base, url, allow_fragments = 1):
    """Resolve a possibly relative URL against a base URL.

    base            -- the base URL; when empty, url is returned as is.
    url             -- the URL to interpret relative to base.
    allow_fragments -- passed through to urlparse() for both URLs.

    Returns the absolute URL as a string.  Resolution follows the
    algorithm of RFC 1808, section 4, with one unofficial extension
    (marked XXX below).
    """
    if not base:
        return url
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)
    # XXX Unofficial hack: default netloc to bnetloc even if
    # schemes differ
    if scheme != bscheme and not netloc and \
       scheme in uses_relative and bscheme in uses_relative and \
       scheme in uses_netloc and bscheme in uses_netloc:
        netloc = bnetloc
        # Strip the port number: drop everything from the first ':'
        # that follows the '@' (or from the first ':' when there is
        # no '@').
        i = netloc.find('@')
        if i < 0:
            i = 0
        i = netloc.find(':', i)
        if i >= 0:
            netloc = netloc[:i]
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # RFC 1808, section 4, step 5: an empty path inherits the base
        # path.  Base params are inherited only when the reference has
        # none, and the base query only when the reference has neither
        # params nor a query of its own.
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    i = bpath.rfind('/')
    if i >= 0:
        path = bpath[:i] + '/' + path
    segments = path.split('/')
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            # A '..' cancels the segment before it, unless that segment
            # is empty or is itself an unresolved '..'.
            if segments[i] == '..' and segments[i-1] not in ('', '..'):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if len(segments) == 2 and segments[1] == '..' and segments[0] == '':
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
186
def urldefrag(url):
    """Split a URL into its fragment-free form and its fragment.

    Returns the pair (defragmented_url, fragment).  When the URL has
    no fragment, the second element is the empty string.
    """
    scheme, netloc, path, params, query, fragment = urlparse(url)
    without_fragment = (scheme, netloc, path, params, query, '')
    return urlunparse(without_fragment), fragment
197
198
# Self-test table consumed by test().  The first non-blank line is the
# base URL; every following line reads
#     <relative-url> = <URL:expected-absolute-url>
# Lines are split on whitespace, so the spacing between the columns
# carries no meaning.
test_input = """
 http://a/b/c/d

 g:h = <URL:g:h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 g = <URL:http://a/b/c/g>
 ./g = <URL:http://a/b/c/g>
 g/ = <URL:http://a/b/c/g/>
 /g = <URL:http://a/g>
 //g = <URL:http://g>
 ?y = <URL:http://a/b/c/d?y>
 g?y = <URL:http://a/b/c/g?y>
 g?y/./x = <URL:http://a/b/c/g?y/./x>
 . = <URL:http://a/b/c/>
 ./ = <URL:http://a/b/c/>
 .. = <URL:http://a/b/>
 ../ = <URL:http://a/b/>
 ../g = <URL:http://a/b/g>
 ../.. = <URL:http://a/>
 ../../g = <URL:http://a/g>
 ../../../g = <URL:http://a/../g>
 ./../g = <URL:http://a/b/g>
 ./g/. = <URL:http://a/b/c/g/>
 /./g = <URL:http://a/./g>
 g/./h = <URL:http://a/b/c/g/h>
 g/../h = <URL:http://a/b/c/h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
"""
Guido van Rossumc08cc501998-12-21 18:24:09 +0000229# XXX The result for //g is actually http://g/; is this a problem?
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000230
def test():
    """Run the urljoin() self-test and print the results.

    Input lines come from the file named by the first command-line
    argument ('-' means standard input) or, without an argument, from
    the built-in test_input table.  The first non-blank line supplies
    the base URL.  For every non-blank line the parsed form of its
    first word and the result of joining it to the base are printed;
    when the line has the form '<url> = <expected>' and the joined
    result differs from <expected>, an 'EXPECTED' line is printed too.
    """
    import sys
    source = None
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            source = sys.stdin
        else:
            source = open(fn)
        # Pull one line at a time so that interactive input is
        # processed as soon as it is typed.
        lines = iter(source.readline, '')
    else:
        lines = test_input.splitlines()
    base = ''
    try:
        for line in lines:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            # print(...) with one parenthesised argument produces the
            # same output under both the print statement and the
            # print function.
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        # Close a file we opened ourselves; never close stdin.
        if source is not None and source is not sys.stdin:
            source.close()
260
# Executed as a script (not imported): run the self-test.
if __name__ == "__main__":
    test()