blob: 560028df94c45d071c5595ae600274eb456e78f6 [file] [log] [blame]
# Parse (absolute and relative) URLs.  See RFC 1808: "Relative Uniform
# Resource Locators", by R. Fielding, UC Irvine, June 1995.
Guido van Rossum23cb2a81994-09-12 10:36:35 +00003
4# Standard/builtin Python modules
5import string
Guido van Rossum3fd32ec1996-05-28 23:54:24 +00006from string import joinfields, splitfields, find, rfind
Guido van Rossum23cb2a81994-09-12 10:36:35 +00007
# A classification of schemes ('' means apply by default).
# Each list names the schemes for which one piece of URL syntax is
# recognised.  They are deliberately plain lists so they stay mutable.

# Schemes for which urljoin() resolves a relative reference.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',
                 'prospero', '']
# Schemes for which a leading '//' introduces a network location.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'https', 'shttp', 'snews',
               'prospero', '']
# Not consulted by any function visible in this file.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews',
                    ]
# Schemes for which urlparse() splits off ';params'.
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp',
               '']
# Schemes for which urlparse() splits off '?query'.
uses_query = ['http', 'wais',
              'https', 'shttp',
              '']
# Schemes for which urlparse() splits off '#fragment'.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names.  Spelled out literally instead of
# using string.letters, whose value depends on the current locale, so
# that only ASCII letters can ever be accepted in a scheme.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlparse() empties its cache once it holds this many entries.
MAX_CACHE_SIZE = 20
# Maps (url, default scheme, allow-fragments flag) to a parsed 6-tuple.
_parse_cache = {}
33
def clear_cache():
    """Forget every parse result memoized so far.

    The module-level name ``_parse_cache`` is rebound to a brand-new
    empty dictionary; the previous dictionary object is not emptied
    in place.
    """
    global _parse_cache
    _parse_cache = {}
38
39
def urlparse(url, scheme = '', allow_framents = 1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Arguments:
    url -- the string to parse.
    scheme -- scheme to assume when url does not start with one.
    allow_framents -- when false, '#' is not treated as a fragment
        delimiter.  (The misspelled name is part of the public
        signature and is kept so keyword callers keep working.)

    Results are memoized in _parse_cache under the key
    (url, scheme, allow_framents).
    """
    key = url, scheme, allow_framents
    try:
        return _parse_cache[key]
    except KeyError:
        pass
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = params = query = fragment = ''
    i = url.find(':')
    if i > 0:
        # Whatever precedes the first ':' is a scheme only if every
        # character of it is a legal scheme character.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    # RFC 1808, section 2.4.1: the fragment starts at the left-most '#'
    # and is removed before anything else is parsed, so it can never
    # leak into the netloc of a URL that has no path.
    if allow_framents and scheme in uses_fragment:
        i = url.find('#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_netloc:
        if url[:2] == '//':
            # netloc runs up to the next '/', or to the end of the string.
            i = url.find('/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if scheme in uses_query:
        i = url.find('?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = url.find(';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    # What is left of url at this point is the path.
    result = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +000082
def urlunparse(components):
    """Put a parsed URL back together again.

    components is a 6-item sequence
    (scheme, netloc, url, params, query, fragment); the result is the
    URL string built from it.  Empty components contribute neither
    text nor delimiter.

    This may result in a slightly different, but equivalent URL, if
    the URL that was parsed originally had redundant delimiters, e.g.
    a ? with an empty query (the draft states that these are
    equivalent).
    """
    # Unpacked here rather than in the signature: tuple parameters are
    # not valid syntax on newer Python versions.
    scheme, netloc, url, params, query, fragment = components
    if netloc:
        # A '/' must separate netloc from whatever follows it, but a
        # bare '/' is not invented when nothing follows at all, so
        # ('http', 'g', '', '', '', '') gives 'http://g'.
        if url[:1] != '/' and (url or params or query or fragment):
            url = '/' + url
        url = '//' + netloc + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
100
def urljoin(base, url, allow_framents = 1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    Arguments:
    base -- the URL that url is relative to; if empty, url is
        returned unchanged.
    url -- the reference to resolve; if empty, base is returned
        unchanged.
    allow_framents -- passed through to urlparse() for both URLs (the
        misspelled name is part of the public signature).

    Returns the resolved URL as a string.
    """
    if not base:
        return url
    if not url:
        # RFC 1808, section 4, step 2a: an empty reference stands for
        # the complete base URL.
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_framents)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_framents)
    # XXX Unofficial hack: default netloc to bnetloc even if
    # schemes differ
    if scheme != bscheme and not netloc and \
       scheme in uses_relative and bscheme in uses_relative and \
       scheme in uses_netloc and bscheme in uses_netloc:
        netloc = bnetloc
        # Strip the port number (looking past any 'user@' prefix)
        i = netloc.find('@')
        if i < 0: i = 0
        i = netloc.find(':', i)
        if i >= 0:
            netloc = netloc[:i]
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # RFC 1808 step 5: the base path is inherited; base params are
        # inherited only if the reference has none, and the base query
        # only if the reference has neither params nor query.
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Replace the last segment of the base path by the relative path.
    i = bpath.rfind('/')
    if i >= 0:
        path = bpath[:i] + '/' + path
    segments = path.split('/')
    # A trailing '.' names the directory itself: keep the final '/'.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly drop one '<segment>/..' pair, restarting the scan after
    # each removal.  Neither the empty segment in front of a leading '/'
    # nor a '..' may play the role of <segment> (RFC 1808 step 6c).
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if segments[i] == '..' and segments[i-1] not in ('', '..'):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if len(segments) == 2 and segments[1] == '..' and segments[0] == '':
        # '/..' collapses to '/'.
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # A trailing '<segment>/..' becomes an empty last segment.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
160
def urldefrag(url):
    """Split url into its fragment-free form and its fragment.

    The URL is parsed with urlparse() and reassembled with
    urlunparse() using an empty fragment.  The return value is the
    pair (URL without fragment, fragment); the fragment is the empty
    string when the URL had none.
    """
    parsed = urlparse(url)
    fragment = parsed[5]
    stripped = urlunparse(parsed[:5] + ('',))
    return stripped, fragment
171
172
# Table consumed by test().  The first non-blank line is the base URL;
# each following line reads "<reference> = <expected joined URL>".
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
"""
203
def test():
    """Run the self-test and print the results.

    Input comes from the file named by the first command-line
    argument ('-' means standard input), or from the built-in
    test_input table when no argument is given.

    For every non-blank line the first word is parsed with urlparse()
    and joined to the base with urljoin(); both results are printed.
    The first URL seen becomes the base for all later lines.  When a
    line has the form "<url> = <expected>", a mismatch is reported on
    an extra line starting with EXPECTED.
    """
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        import StringIO
        fp = StringIO.StringIO(test_input)
    try:
        while 1:
            line = fp.readline()
            if not line: break
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                # An empty base makes urljoin() hand back url itself.
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        # Release the input we opened; standard input is left alone.
        if fp is not sys.stdin:
            fp.close()
233
# Executed as a script: run the self-test.
if __name__ == '__main__':
    test()