# blob: 185eb7fd6efa4faee194ac3d20d450fabcb65cef
# Parse (absolute and relative) URLs.  See RFC 1808: "Relative Uniform
# Resource Locators", by R. Fielding, UC Irvine, June 1995.

# Standard/builtin Python modules
import string
try:
    from string import joinfields, splitfields, find, rfind
except ImportError:
    # Newer interpreters no longer provide these function forms in the
    # string module; the equivalent str methods cover the same operations.
    pass

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',
                 'prospero', '']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'https', 'shttp', 'snews',
               'prospero', '']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews',
                    ]
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp',
               '']
uses_query = ['http', 'wais',
              'https', 'shttp',
              'gopher',
              '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names.  Spelled out explicitly so the set
# does not depend on the locale or on which attributes the string
# module happens to provide.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# Upper bound on the number of memoized parse results; the parser
# empties the whole cache once this many entries are stored.
MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache.

    Rebinds the module-level ``_parse_cache`` to a fresh empty dict, so
    every previously memoized parse result is discarded.
    """
    global _parse_cache
    _parse_cache = {}


def urlparse(url, scheme='', allow_framents=1):
    """Parse a URL into 6 components.

    The general shape handled is
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    and the result is the 6-tuple
    (scheme, netloc, path, params, query, fragment).

    The components are not broken up into smaller bits (e.g. netloc is
    a single string) and % escapes are not expanded.

    Arguments:
    url -- the URL string to split.
    scheme -- scheme to assume when the URL does not carry one.
    allow_framents -- when false, a '#' is left in place instead of
        starting a fragment.  (The spelling of this keyword is kept
        as-is so existing callers keep working.)

    Results are memoized in the module-level _parse_cache, keyed on all
    three arguments.
    """
    key = url, scheme, allow_framents
    try:
        return _parse_cache[key]
    except KeyError:
        pass
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = params = query = fragment = ''
    # A leading "<scheme>:" only counts when every character before the
    # colon is a valid scheme character; otherwise the default is kept.
    i = url.find(':')
    if i > 0:
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if scheme in uses_netloc:
        if url[:2] == '//':
            # The netloc runs up to the next '/', or to the end of the
            # string when there is none.
            i = url.find('/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if allow_framents and scheme in uses_fragment:
        # The fragment starts at the last '#'.
        i = url.rfind('#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_query:
        i = url.find('?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = url.find(';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    # Whatever is left of url at this point is the path.
    result = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = result
    return result

def urlunparse(components):
    """Put a parsed URL back together again.

    components is a 6-item sequence
    (scheme, netloc, url, params, query, fragment), passed as a single
    positional argument.

    This may result in a slightly different, but equivalent URL, if the
    URL that was parsed originally had redundant delimiters, e.g. a ?
    with an empty query (the draft states that these are equivalent).
    Empty components are omitted together with their delimiter.
    """
    scheme, netloc, url, params, query, fragment = components
    if netloc:
        # A non-empty path following a netloc must be absolute.  An
        # empty path is left empty so that e.g. "//g" resolves to
        # "http://g" and not "http://g/".
        if url and url[:1] != '/': url = '/' + url
        url = '//' + netloc + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url

def urljoin(base, url, allow_framents=1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    Arguments:
    base -- the base URL; when empty, url is returned unchanged.
    url -- the URL to resolve; it is parsed with the base's scheme as
        its default scheme.
    allow_framents -- passed through to urlparse() for both URLs.

    When the schemes differ, or the scheme is not in uses_relative, the
    URL is only parsed and reassembled.  Otherwise a missing netloc is
    taken from the base, an empty path inherits the base path (and the
    base query when url has none), and a relative path is merged with
    the directory part of the base path, after which '.' and '..'
    segments are resolved.
    """
    if not base:
        return url
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_framents)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_framents)
    # XXX Unofficial hack: default netloc to bnetloc even if
    # schemes differ
    if scheme != bscheme and not netloc and \
       scheme in uses_relative and bscheme in uses_relative and \
       scheme in uses_netloc and bscheme in uses_netloc:
        netloc = bnetloc
        # Strip the port number: look for ':' only after any '@' so a
        # "user:password@" prefix is not mistaken for a port separator.
        i = netloc.find('@')
        if i < 0: i = 0
        i = netloc.find(':', i)
        if i >= 0:
            netloc = netloc[:i]
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: nothing to merge with the base path.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        return urlunparse((scheme, netloc, bpath,
                           params, query or bquery, fragment))
    # Replace the last segment of the base path by the relative path.
    i = bpath.rfind('/')
    if i >= 0:
        path = bpath[:i] + '/' + path
    segments = path.split('/')
    # A trailing '.' means "this directory": keep the trailing slash.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly cancel one "<segment>/.." pair, rescanning from the
    # start after each removal, until no such pair is left.  The first
    # and last segments are never treated as the '..' of a pair here.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            # A '..' cancels only a real preceding segment: neither the
            # empty segment before a leading '/' nor another '..'.
            if segments[i] == '..' and segments[i-1] not in ('', '..'):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # Handle a '..' left in final position.
    if len(segments) == 2 and segments[1] == '..' and segments[0] == '':
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))

def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.

    The defragmented URL is rebuilt with urlunparse() from the parsed
    components, with the fragment replaced by the empty string.
    """
    scheme, netloc, path, params, query, frag = urlparse(url)
    defrag = urlunparse((scheme, netloc, path, params, query, ''))
    return defrag, frag


# Self-test table read by test().  The first non-blank line is the base
# URL; every following line is "<relative URL> = <expected result>".
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
"""

def test():
    """Run the urljoin() self-test and print the results.

    Input is read from the file named by the first command-line
    argument ('-' means standard input), or from test_input when no
    argument is given.  The first URL read becomes the base; every URL
    is parsed and joined against that base, and both results are
    printed.  For a line of the form "<url> = <expected>", a line
    starting with EXPECTED is printed when the joined result differs.
    """
    import sys
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO
    opened = None  # set only for a file this function opened itself
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        fp = StringIO(test_input)
    base = ''
    try:
        while 1:
            line = fp.readline()
            if not line: break
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        # Never close sys.stdin; only release what was opened above.
        if opened is not None:
            opened.close()

# Run the self-test when executed as a script.
if __name__ == '__main__':
    test()