blob: 31d853a168dda00ae779734f6d8c1778b1806a2a [file] [log] [blame]
Guido van Rossumededb581996-03-29 21:23:25 +00001# Parse (absolute and relative) URLs. See RFC 1808: "Relative Uniform
2# Resource Locators", by R. Fielding, UC Irvine, June 1995.
Guido van Rossum23cb2a81994-09-12 10:36:35 +00003
4# Standard/builtin Python modules
5import string
Guido van Rossum3fd32ec1996-05-28 23:54:24 +00006from string import joinfields, splitfields, find, rfind
Guido van Rossum23cb2a81994-09-12 10:36:35 +00007
8# A classification of schemes ('' means apply by default)
# Schemes for which urljoin() resolves a relative reference against the
# base URL.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',
                 'prospero', '']
# Schemes for which urlparse() recognizes a leading '//<netloc>' part.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'https', 'shttp', 'snews',
               'prospero', '']
# Kept as part of the module's public tables; none of the functions shown
# alongside it consult this list.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews',
                    ]
# Schemes for which urlparse() splits off ';<params>'.
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp',
               '']
# Schemes for which urlparse() splits off '?<query>'.
uses_query = ['http', 'wais',
              'https', 'shttp',
              '']
# Schemes for which urlparse() splits off '#<fragment>'.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names.  Spelled out literally instead of being
# derived from string.letters: that value follows the current locale, while
# scheme names are restricted to ASCII letters, digits, '+', '-' and '.'.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlparse() empties its cache once it holds this many entries.
MAX_CACHE_SIZE = 2000
# Maps (url, scheme, allow_framents) to the 6-tuple urlparse() returned.
_parse_cache = {}
33
def clear_cache():
    """Throw away every memoized parse result.

    The module-level cache name is rebound to a brand-new empty
    dictionary; the dictionary that was in use before is not modified.
    """
    global _parse_cache
    _parse_cache = {}
38
39
Guido van Rossum23cb2a81994-09-12 10:36:35 +000040# Parse a URL into 6 components:
41# <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
42# Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
43# Note that we don't break the components up in smaller bits
44# (e.g. netloc is a single string) and we don't expand % escapes.
def urlparse(url, scheme = '', allow_framents = 1):
    """Split a URL into (scheme, netloc, path, params, query, fragment).

    General form: <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Arguments:
      url -- the URL string to split.
      scheme -- scheme to assume when url does not start with one.
      allow_framents -- (spelling kept for compatibility) when false, '#'
          is never treated as a fragment delimiter.

    A component is only split off when the effective scheme is listed in
    the matching uses_* table; otherwise its delimiter stays in the path.
    Components are not broken up further (netloc is a single string) and
    % escapes are not expanded.  A scheme found in url is lower-cased.

    Results are memoized in _parse_cache under (url, scheme,
    allow_framents); the whole cache is dropped once it reaches
    MAX_CACHE_SIZE entries.
    """
    key = url, scheme, allow_framents
    try:
        return _parse_cache[key]
    except KeyError:
        pass
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = params = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        # Only accept the prefix as a scheme if every character is legal.
        for c in url[:colon]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:colon].lower(), url[colon+1:]
    # The fragment is removed first (RFC 1808, section 2.4.1), starting at
    # the left-most '#', so that it cannot end up inside the netloc of a
    # URL such as 'http://host#frag'.
    if allow_framents and scheme in uses_fragment:
        i = url.find('#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_netloc and url[:2] == '//':
        # The netloc runs up to (not including) the next slash.
        i = url.find('/', 2)
        if i < 0:
            i = len(url)
        netloc, url = url[2:i], url[i:]
    if scheme in uses_query:
        i = url.find('?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = url.find(';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    result = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +000080
81# Put a parsed URL back together again. This may result in a slightly
82# different, but equivalent URL, if the URL that was parsed originally
83# had redundant delimiters, e.g. a ? with an empty query (the draft
84# states that these are equivalent).
def urlunparse(components):
    """Put a parsed URL back together again.

    components is the 6-sequence (scheme, netloc, path, params, query,
    fragment), e.g. as returned by urlparse().  Returns the URL string.

    The result may differ slightly from the string that was originally
    parsed if that string had redundant delimiters (e.g. a '?' followed
    by an empty query): empty components are emitted without their
    delimiter.
    """
    scheme, netloc, url, params, query, fragment = components
    if netloc:
        # A non-empty path that follows a netloc must be rooted.  An empty
        # path is left empty so that 'http://g' does not come back as
        # 'http://g/'.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + netloc + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
98
99# Join a base URL and a possibly relative URL to form an absolute
100# interpretation of the latter.
def urljoin(base, url, allow_framents = 1):
    """Join a base URL and a possibly relative URL into an absolute URL.

    Arguments:
      base -- the base URL; when empty, url is returned unchanged.
      url -- the absolute or relative URL to interpret against base.
      allow_framents -- (spelling kept for compatibility) passed through
          to urlparse() for both URLs.

    Returns the resulting URL string, rebuilt with urlunparse().
    """
    if not base:
        return url
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_framents)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_framents)
    # XXX Unofficial hack: default netloc to bnetloc even if
    # schemes differ
    if scheme != bscheme and not netloc and \
       scheme in uses_relative and bscheme in uses_relative and \
       scheme in uses_netloc and bscheme in uses_netloc:
        netloc = bnetloc
        # Strip the port number (searching after any 'user@' prefix)
        i = netloc.find('@')
        if i < 0: i = 0
        i = netloc.find(':', i)
        if i >= 0:
            netloc = netloc[:i]
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # RFC 1808, section 4, step 5: an empty path inherits the base
        # path; params are inherited only when the reference has none, and
        # the query only when it has neither params nor query of its own.
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Replace the last segment of the base path with the relative path.
    i = bpath.rfind('/')
    if i >= 0:
        path = bpath[:i] + '/' + path
    segments = path.split('/')
    # A trailing '.' means "this directory": keep the trailing slash.
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [s for s in segments if s != '.']
    # Repeatedly collapse the left-most interior '<segment>/..' pair.  A
    # preceding '' or '..' is not a real segment and must not be removed,
    # so surplus '..' components are preserved in the result.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if segments[i] == '..' and segments[i-1] not in ('', '..'):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        # '/..' resolves to the root, not to an empty path.
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # A trailing '<segment>/..' collapses to a trailing slash.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
156
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' not in url:
        # Nothing to strip: return the URL untouched rather than
        # round-tripping it through urlparse()/urlunparse(), which may
        # rewrite redundant delimiters.
        return url, ''
    s, n, p, a, q, frag = urlparse(url)
    defrag = urlunparse((s, n, p, a, q, ''))
    return defrag, frag
167
168
# Built-in data for test().  Blank lines are skipped.  The first word of the
# first non-blank line becomes the base URL; on every later line the first
# word is joined against that base, and when the line has the form
# "<url> = <URL:expected>" the wrapped result is compared with the third word.
test_input = """
 http://a/b/c/d

 g:h = <URL:g:h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 g = <URL:http://a/b/c/g>
 ./g = <URL:http://a/b/c/g>
 g/ = <URL:http://a/b/c/g/>
 /g = <URL:http://a/g>
 //g = <URL:http://g>
 ?y = <URL:http://a/b/c/d?y>
 g?y = <URL:http://a/b/c/g?y>
 g?y/./x = <URL:http://a/b/c/g?y/./x>
 . = <URL:http://a/b/c/>
 ./ = <URL:http://a/b/c/>
 .. = <URL:http://a/b/>
 ../ = <URL:http://a/b/>
 ../g = <URL:http://a/b/g>
 ../.. = <URL:http://a/>
 ../../g = <URL:http://a/g>
 ../../../g = <URL:http://a/../g>
 ./../g = <URL:http://a/b/g>
 ./g/. = <URL:http://a/b/c/g/>
 /./g = <URL:http://a/./g>
 g/./h = <URL:http://a/b/c/g/h>
 g/../h = <URL:http://a/b/c/h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
"""
199
def test():
    """Self-test driver: parse and join URLs read line by line.

    Input comes from the file named by the first command-line argument
    ('-' means standard input), or from test_input when no argument is
    given.  The first word of the first non-blank line becomes the base
    URL.  For every non-blank line the parsed components and the joined
    URL are printed; when the line has the form '<url> = <expected>' and
    the wrapped result differs from <expected>, an EXPECTED line is
    printed as well.
    """
    import sys
    base = ''
    opened = None  # set only for a file this function opened itself
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        try:
            from StringIO import StringIO
        except ImportError:
            from io import StringIO
        fp = StringIO(test_input)
    try:
        while 1:
            line = fp.readline()
            if not line: break
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % (words[2],))
    finally:
        # Close the input file, but never sys.stdin.
        if opened is not None:
            opened.close()
229
# Run the self-test when this file is executed as a script.
if __name__ == '__main__':
    test()