blob: 91775333c7328177e6de77cdf968bfee0fcf5153 [file] [log] [blame]
# Parse (absolute and relative) URLs.  See RFC 1808: "Relative Uniform
# Resource Locators", by R. Fielding, UC Irvine, June 1995.
Guido van Rossum23cb2a81994-09-12 10:36:35 +00003
# Standard/builtin Python modules
5import string
Guido van Rossum3fd32ec1996-05-28 23:54:24 +00006from string import joinfields, splitfields, find, rfind
Guido van Rossum23cb2a81994-09-12 10:36:35 +00007
# A classification of schemes ('' means apply by default).
# Each list names the schemes for which the corresponding URL feature
# is recognised; the empty string stands for "no scheme given".
# These are deliberately plain lists so that they can be searched with
# `in` and extended in place.

# Schemes whose URLs may be written relative to a base URL.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'prospero', '']
# Schemes whose URLs may carry a '//<netloc>' part.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'prospero', '']
# Schemes classified as non-hierarchical.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais']
# Schemes whose URLs may carry ';<params>'.
uses_params = ['ftp', 'hdl', 'prospero', 'http', '']
# Schemes whose URLs may carry '?<query>'.
uses_query = ['http', 'wais', '']
# Schemes whose URLs may carry '#<fragment>'.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'file', 'prospero', '']
18
# Characters valid in scheme names: ASCII letters, digits and '+', '-',
# '.'.  Spelled out literally so the set does not vary with the locale
# (string.letters does) and does not depend on which names the string
# module happens to provide.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# Upper bound on the number of memoized parse results; urlparse() empties
# the cache once it holds this many entries.
MAX_CACHE_SIZE = 2000
# Maps (url, scheme, allow_framents) -> 6-tuple returned by urlparse().
_parse_cache = {}
24
def clear_cache():
    """Clear the parse cache.

    Rebinds the module-level `_parse_cache` to a fresh empty dictionary,
    discarding every memoized urlparse() result.  Returns None.
    """
    global _parse_cache
    _parse_cache = {}
29
30
# Parse a URL into 6 components:
# <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
# Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
# Note that we don't break the components up in smaller bits
# (e.g. netloc is a single string) and we don't expand % escapes.
def urlparse(url, scheme='', allow_framents=1):
    """Split `url` into (scheme, netloc, path, params, query, fragment).

    Arguments:
      url            -- the URL string to parse.
      scheme         -- scheme to assume when `url` does not start with
                        a valid '<scheme>:' prefix.
      allow_framents -- when false, '#' is not treated as a fragment
                        delimiter.  (The misspelling is part of the
                        public signature and is kept for compatibility.)

    Which delimiters are honoured depends on the scheme, as recorded in
    the module-level uses_* lists.  A scheme found in the URL is
    returned in lower case; components that are absent are returned as
    empty strings.

    Results are memoized in `_parse_cache`; once the cache holds
    MAX_CACHE_SIZE entries it is emptied before the new result is added.
    """
    key = url, scheme, allow_framents
    cached = _parse_cache.get(key)
    if cached is not None:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = params = query = fragment = ''
    i = url.find(':')
    if i > 0:
        # Accept the prefix as a scheme only if every character is legal.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if scheme in uses_netloc:
        if url[:2] == '//':
            # The netloc runs up to (not including) the next slash.
            i = url.find('/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if allow_framents and scheme in uses_fragment:
        # RFC 1808, section 2.4.1: the fragment starts after the
        # left-most '#', so any later '#' belongs to the fragment.
        i = url.find('#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_query:
        i = url.find('?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = url.find(';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    # Whatever is left of `url` at this point is the path.
    result = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +000071
# Put a parsed URL back together again.  This may result in a slightly
# different, but equivalent URL, if the URL that was parsed originally
# had redundant delimiters, e.g. a ? with an empty query (the draft
# states that these are equivalent).
def urlunparse(parts):
    """Combine a 6-item sequence into a URL string.

    `parts` is (scheme, netloc, path, params, query, fragment), i.e. the
    shape returned by urlparse().  Empty components are omitted together
    with their delimiter.  When a netloc is present, the path is given a
    leading '/' if it lacks one.

    Raises ValueError (or TypeError for a non-sequence) if `parts` does
    not unpack into exactly six items.
    """
    # Unpacked here rather than in the signature: tuple parameters are
    # not valid syntax in Python 3.
    scheme, netloc, url, params, query, fragment = parts
    if netloc:
        if url[:1] != '/':
            url = '/' + url
        url = '//' + netloc + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
89
# Join a base URL and a possibly relative URL to form an absolute
# interpretation of the latter.
def urljoin(base, url, allow_framents=1):
    """Resolve `url` relative to `base` and return the resulting URL.

    Arguments:
      base           -- the base URL; if empty, `url` is returned as is.
      url            -- the (possibly relative) URL; if empty, `base` is
                        returned as is.
      allow_framents -- passed through to urlparse() for both URLs.
                        (The misspelling is part of the public
                        signature and is kept for compatibility.)

    The resolution follows the steps of RFC 1808, section 4, with one
    unofficial extension: when the two schemes differ, `url` has no
    netloc, and both schemes support relative URLs and netlocs, the
    base netloc (minus any port number) is carried over.
    """
    if not base:
        return url
    if not url:
        # Step 2a: an entirely empty embedded URL is the base URL.
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_framents)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_framents)
    # XXX Unofficial hack: default netloc to bnetloc even if
    # schemes differ
    if scheme != bscheme and not netloc and \
       scheme in uses_relative and bscheme in uses_relative and \
       scheme in uses_netloc and bscheme in uses_netloc:
        netloc = bnetloc
        # Strip the port number (searching after any 'user@' part)
        i = netloc.find('@')
        if i < 0:
            i = 0
        i = netloc.find(':', i)
        if i >= 0:
            netloc = netloc[:i]
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Step 5: an empty path inherits the base path; base params are
        # inherited only if ours are empty, and the base query only if
        # both our params and our query are empty.
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Step 6: replace the last segment of the base path by our path.
    i = bpath.rfind('/')
    if i >= 0:
        path = bpath[:i] + '/' + path
    segments = path.split('/')
    # A trailing '.' becomes an empty segment so the trailing '/' stays.
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly drop the left-most '<segment>/..' pair.  The segment
    # must be a real one: an empty segment or another '..' cannot be
    # cancelled (RFC 1808: '../../../../g' -> 'http://a/../../g').
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if segments[i] == '..' and segments[i-1] not in ('', '..'):
                del segments[i-1:i+1]
                break
            i = i + 1
        else:
            break
    # A trailing '<segment>/..' collapses to a trailing '/'; again a
    # '..' is never cancelled by another '..'.
    if len(segments) >= 2 and segments[-1] == '..' and \
       segments[-2] != '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
147
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.

    The defragmented URL is rebuilt with urlunparse() from the parsed
    components, so it may differ textually from the input in ways that
    parsing normalizes (for instance the scheme comes back lower-cased).
    """
    scheme, netloc, path, params, query, fragment = urlparse(url)
    defragmented = urlunparse((scheme, netloc, path, params, query, ''))
    return defragmented, fragment
158
159
# Default input for test(): the first non-blank line is the base URL;
# every following line is '<relative-url> = <URL:expected-absolute-url>'.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
"""
190
def test():
    """Run urlparse() and urljoin() over a list of URLs and print results.

    Input is read from the file named by the first command-line
    argument ('-' means standard input), or from the built-in
    `test_input` string when no argument is given.  The first URL read
    becomes the base for all following ones.  For each non-blank line
    the parsed components and the joined URL are printed; if the line
    has the form '<url> = <expected>' and the joined URL (wrapped as
    '<URL:...>') differs from <expected>, an EXPECTED line is printed
    as well.
    """
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        # The StringIO class lives in different modules in Python 2
        # and Python 3.
        try:
            from StringIO import StringIO
        except ImportError:
            from io import StringIO
        fp = StringIO(test_input)
    try:
        while 1:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        # Release whatever we opened, but leave the caller's stdin alone.
        if fp is not sys.stdin:
            fp.close()
220
# Run the self-test when this file is executed as a script.
if __name__ == '__main__':
    test()