blob: 22a5fd0e901d4094a9319108529106f1f57f027f [file] [log] [blame]
Guido van Rossumededb581996-03-29 21:23:25 +00001# Parse (absolute and relative) URLs. See RFC 1808: "Relative Uniform
2# Resource Locators", by R. Fielding, UC Irvine, June 1995.
Guido van Rossum23cb2a81994-09-12 10:36:35 +00003
4# Standard/builtin Python modules
5import string
Guido van Rossum3fd32ec1996-05-28 23:54:24 +00006from string import joinfields, splitfields, find, rfind
Guido van Rossum23cb2a81994-09-12 10:36:35 +00007
# A classification of schemes ('' means apply by default).
# Each list names the URL schemes that have the given property; the
# empty string stands for "no scheme given".

# Schemes whose URLs may be resolved relative to a base URL.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
    'https', 'shttp',
    'prospero', '',
]

# Schemes whose URLs may carry a '//<netloc>' part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
    'https', 'shttp', 'snews',
    'prospero', '',
]

# Schemes without a hierarchical path.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
    'snews',
]

# Schemes whose URLs may carry ';<params>'.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http',
    'https', 'shttp',
    '',
]

# Schemes whose URLs may carry '?<query>'.
uses_query = [
    'http', 'wais',
    'https', 'shttp',
    '',
]

# Schemes whose URLs may carry '#<fragment>'.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
    'https', 'shttp', 'snews',
    'file', 'prospero', '',
]
27
# Characters valid in scheme names.
# Spelled out as an explicit ASCII literal rather than built from
# string.letters, whose contents depend on the current locale: a scheme
# name consists of ASCII letters, digits, '+', '-' and '.' only.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
30
# urlparse() memoizes its results.  Once the cache holds this many
# entries, urlparse() empties it before adding another one.
MAX_CACHE_SIZE = 20

# Maps (url, scheme, allow_framents) argument triples to the 6-tuple
# that urlparse() returned for them.
_parse_cache = {}


def clear_cache():
    """Forget every memoized urlparse() result.

    The module-level cache name is rebound to a brand-new empty
    dictionary.
    """
    global _parse_cache
    _parse_cache = {}
38
39
# Parse a URL into 6 components:
# <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
# Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
# Note that we don't break the components up in smaller bits
# (e.g. netloc is a single string) and we don't expand % escapes.
def urlparse(url, scheme = '', allow_framents = 1):
    """Split url into (scheme, netloc, path, params, query, fragment).

    scheme is the scheme to assume when url does not start with one; a
    scheme found in url is returned in lower case.  When allow_framents
    is false, a '#...' suffix is left in place instead of being split
    off as the fragment.  Components that are absent, or that the
    scheme is not classified as using, come back as ''.

    Results are memoized in _parse_cache, keyed on the three arguments.
    """
    key = url, scheme, allow_framents
    try:
        return _parse_cache[key]
    except KeyError:
        pass
    if len(_parse_cache) >= MAX_CACHE_SIZE:	# avoid runaway growth
        clear_cache()

    netloc = params = query = fragment = ''
    rest = url

    # A leading "<name>:" counts as a scheme only if the name is
    # non-empty and made up entirely of scheme characters.
    colon = string.find(rest, ':')
    if colon > 0:
        candidate = rest[:colon]
        well_formed = 1
        for ch in candidate:
            if ch not in scheme_chars:
                well_formed = 0
                break
        if well_formed:
            scheme = string.lower(candidate)
            rest = rest[colon+1:]

    # "//<netloc>" runs up to the next '/', or to the end of the string.
    if scheme in uses_netloc and rest[:2] == '//':
        slash = string.find(rest, '/', 2)
        if slash < 0:
            slash = len(rest)
        netloc = rest[2:slash]
        rest = rest[slash:]

    # Peel the trailing components off from right to left: the fragment
    # starts at the last '#', the query at the first '?', and the
    # params at the first ';' of what remains.
    if allow_framents and scheme in uses_fragment:
        cut = string.rfind(rest, '#')
        if cut >= 0:
            fragment = rest[cut+1:]
            rest = rest[:cut]
    if scheme in uses_query:
        cut = string.find(rest, '?')
        if cut >= 0:
            query = rest[cut+1:]
            rest = rest[:cut]
    if scheme in uses_params:
        cut = string.find(rest, ';')
        if cut >= 0:
            params = rest[cut+1:]
            rest = rest[:cut]

    result = scheme, netloc, rest, params, query, fragment
    _parse_cache[key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +000082
# Put a parsed URL back together again.  This may result in a slightly
# different, but equivalent URL, if the URL that was parsed originally
# had redundant delimiters, e.g. a ? with an empty query (the draft
# states that these are equivalent).
def urlunparse(parts):
    """Build a URL string from a 6-tuple of components.

    parts is a (scheme, netloc, url, params, query, fragment) tuple such
    as the one urlparse() returns; 'url' is the path component.  Empty
    components are omitted together with their delimiter.

    The tuple is unpacked in the body rather than in the parameter
    list, so the definition does not rely on tuple parameters.
    """
    scheme, netloc, url, params, query, fragment = parts
    if netloc:
        # A non-empty path must be separated from the netloc by '/'.
        # An empty path is left empty so that e.g. 'http://g' does not
        # come back as 'http://g/'.
        if url and url[:1] != '/': url = '/' + url
        url = '//' + netloc + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
100
# Join a base URL and a possibly relative URL to form an absolute
# interpretation of the latter.
def urljoin(base, url, allow_framents = 1):
    """Resolve url against base and return the resulting URL string.

    If base is empty, url is returned unchanged.  Otherwise both are
    parsed with urlparse() (allow_framents is passed through) and the
    missing leading components of url are taken from base, following
    the resolution steps of RFC 1808:

    - a url with a different scheme, or a scheme that does not support
      relative forms, is returned as parsed;
    - a url with its own netloc or an absolute path keeps them;
    - a url with an empty path inherits the base path, the base params
      if it has none of its own, and the base query only if it has
      neither params nor query of its own;
    - otherwise the path is merged with the base path and '.' and
      '<segment>/..' parts are removed.
    """
    if not base:
        return url
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_framents)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_framents)
    # XXX Unofficial hack: default netloc to bnetloc even if
    # schemes differ
    if scheme != bscheme and not netloc and \
       scheme in uses_relative and bscheme in uses_relative and \
       scheme in uses_netloc and bscheme in uses_netloc:
        netloc = bnetloc
        # Strip the port number (looking for ':' only after any
        # 'user@' prefix)
        i = find(netloc, '@')
        if i < 0: i = 0
        i = find(netloc, ':', i)
        if i >= 0:
            netloc = netloc[:i]
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # RFC 1808, section 4, step 5: with an empty path the base path
        # is inherited; the base params are inherited only when url has
        # none, and the base query only when url has neither params nor
        # a query of its own.
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Replace the last segment of the base path with the relative path.
    i = rfind(bpath, '/')
    if i >= 0:
        path = bpath[:i] + '/' + path
    segments = splitfields(path, '/')
    # A trailing '.' means "this directory": keep the trailing slash.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly drop the leftmost '<segment>/..' pair.  A '..' may only
    # cancel a real segment, never the empty leading segment or another
    # '..', so surplus '..' parts are preserved (RFC 1808, section 5.2).
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if segments[i] == '..' and segments[i-1] not in ('', '..'):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # A final '<segment>/..' collapses to a trailing slash.
    if len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, joinfields(segments, '/'),
                       params, query, fragment))
158
def urldefrag(url):
    """Split the fragment off a URL.

    Returns the pair (URL without its fragment, fragment).  The
    fragment is the empty string when urlparse() finds none in url.
    """
    parts = urlparse(url)
    # Index 5 of the parsed tuple is the fragment; rebuild the URL from
    # the other five components with an empty fragment.
    fragment = parts[5]
    stripped = urlunparse(parts[:5] + ('',))
    return stripped, fragment
169
170
# Default input for test().  Each non-blank line is split on whitespace:
# the first word is a URL that is parsed and joined against the base,
# where the base is the result for the first line (here http://a/b/c/d).
# On lines of the form "<url> = <URL:expected>", test() compares the
# joined result with the expected text and reports any mismatch.
test_input = """
 http://a/b/c/d

 g:h = <URL:g:h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 g = <URL:http://a/b/c/g>
 ./g = <URL:http://a/b/c/g>
 g/ = <URL:http://a/b/c/g/>
 /g = <URL:http://a/g>
 //g = <URL:http://g>
 ?y = <URL:http://a/b/c/d?y>
 g?y = <URL:http://a/b/c/g?y>
 g?y/./x = <URL:http://a/b/c/g?y/./x>
 . = <URL:http://a/b/c/>
 ./ = <URL:http://a/b/c/>
 .. = <URL:http://a/b/>
 ../ = <URL:http://a/b/>
 ../g = <URL:http://a/b/g>
 ../.. = <URL:http://a/>
 ../../g = <URL:http://a/g>
 ../../../g = <URL:http://a/../g>
 ./../g = <URL:http://a/b/g>
 ./g/. = <URL:http://a/b/c/g/>
 /./g = <URL:http://a/./g>
 g/./h = <URL:http://a/b/c/g/h>
 g/../h = <URL:http://a/b/c/h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
"""
201
def test():
    """Run the module self-test and print the results.

    Input lines come from the file named by the first command-line
    argument ('-' means standard input), or from test_input when no
    argument is given.  For every non-blank line the first word is
    parsed with urlparse() and joined against the base with urljoin();
    the first joined URL becomes the base for the following lines.  If
    a line has the form '<url> = <expected>', a mismatch between the
    joined URL (wrapped as '<URL:...>') and <expected> is reported.
    """
    import sys
    base = ''
    opened = None			# file we opened ourselves, if any
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        import StringIO
        fp = StringIO.StringIO(test_input)
    try:
        while 1:
            line = fp.readline()
            if not line: break
            words = string.split(line)
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            # Each print is given one pre-formatted string, so the output
            # is the same whether print is a statement or a function.
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        # Close only a file opened here; leave sys.stdin alone.
        if opened is not None:
            opened.close()

if __name__ == '__main__':
    test()