blob: 1df83d68d31b13b895c069657e149271cbf2fcac [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""Parse (absolute and relative) URLs.
2
3See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
4UC Irvine, June 1995.
5"""
Guido van Rossum23cb2a81994-09-12 10:36:35 +00006
# Public API of this module.  urldefrag() is defined and documented below
# with the other public helpers, so export it too.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag"]
8
# A classification of schemes ('' means apply by default)
# Schemes for which relative references are resolved against a base
# (consulted by urljoin()).
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',
                 'prospero', 'rtsp', 'rtspu', '']
# Schemes whose URLs may carry a //<netloc> network-location part.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'file',
               'https', 'shttp', 'snews',
               'prospero', 'rtsp', 'rtspu', '']
# NOTE(review): not referenced elsewhere in this file; presumably kept
# as part of the public classification tables for callers.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews', 'sip',
                    ]
# Schemes that may carry a ';params' component in the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip',
               '']
# Schemes that may carry a '?query' component.
uses_query = ['http', 'wais',
              'https', 'shttp',
              'gopher', 'rtsp', 'rtspu', 'sip',
              '']
# Schemes that may carry a '#fragment' component.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlparse() memoizes its results in _parse_cache; once the cache holds
# MAX_CACHE_SIZE entries it is cleared wholesale (see clear_cache()).
MAX_CACHE_SIZE = 20
_parse_cache = {}
39
def clear_cache():
    """Throw away all memoized urlparse() results.

    Rebinds the module-level cache to a fresh empty dictionary (it does
    not mutate the old one in place).
    """
    global _parse_cache
    _parse_cache = {}
Guido van Rossum3fd32ec1996-05-28 23:54:24 +000044
45
def urlparse(url, scheme = '', allow_fragments = 1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    # Results are memoized on the full argument triple; the cache is
    # wiped wholesale once it reaches MAX_CACHE_SIZE entries.
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = path = params = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            # Fast path: inline the whole split for http URLs and return
            # directly, skipping the per-scheme capability checks below.
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                # The netloc runs from after '//' up to the next '/'
                # (or to the end of the string if there is none).
                i = url.find('/', 2)
                if i < 0:
                    i = len(url)
                netloc = url[2:i]
                url = url[i:]
            if allow_fragments:
                # rfind: the fragment starts at the *last* '#'.
                i = url.rfind('#')
                if i >= 0:
                    fragment = url[i+1:]
                    url = url[:i]
            i = url.find('?')
            if i >= 0:
                query = url[i+1:]
                url = url[:i]
            i = url.find(';')
            if i >= 0:
                params = url[i+1:]
                url = url[:i]
            # NOTE(review): 'tuple' shadows the builtin; harmless here
            # but renaming it would be cleaner.
            tuple = scheme, netloc, url, params, query, fragment
            _parse_cache[key] = tuple
            return tuple
        # General case: accept the prefix as a scheme only if every
        # character is legal in a scheme name; otherwise keep the
        # caller-supplied default scheme and leave url untouched.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    # Split off each remaining component only when the scheme is
    # classified as using it (see the uses_* tables above).
    if scheme in uses_netloc:
        if url[:2] == '//':
            i = url.find('/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if allow_fragments and scheme in uses_fragment:
        i = url.rfind('#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_query:
        i = url.find('?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = url.find(';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    tuple = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = tuple
    return tuple
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000112
# NOTE: the tuple-unpacking parameter below is Python 2-only syntax.
def urlunparse((scheme, netloc, url, params, query, fragment)):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        # Make sure the path starts with '/' before gluing '//<netloc>'
        # in front of it.
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    # Append each optional component only when non-empty, so no
    # redundant trailing delimiters are produced.
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000130
def urljoin(base, url, allow_fragments = 1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    # Trivial cases: an empty base or an empty url yields the other.
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # Parse url using the base's scheme as the default scheme.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        # Different scheme, or one that never takes relative references:
        # the url stands on its own.
        return url
    if scheme in uses_netloc:
        if netloc:
            # url carries its own netloc; inherit nothing from base.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only the netloc is inherited.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: inherit base's path, and fill in missing params
        # and query.  NOTE(review): query is inherited only when params
        # is *also* empty -- confirm this nesting against RFC 1808.
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Relative path: merge with all but the last segment of base's
    # path, then collapse '.' and '..' segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' segment means "this directory": keep the slash.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly remove the first 'name/..' pair whose left side is a
    # real segment (not '' or '..'); stop when a full scan finds none.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # Keep a trailing slash when the path reduces to '..' at the end.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000182
def urldefrag(url):
    """Strip the fragment, if any, from *url*.

    Return a ``(defragmented_url, fragment)`` pair.  When *url* carries
    no fragment, the second element is the empty string.
    """
    scheme, netloc, path, params, query, fragment = urlparse(url)
    defragmented = urlunparse((scheme, netloc, path, params, query, ''))
    return defragmented, fragment
Guido van Rossum3fd32ec1996-05-28 23:54:24 +0000193
194
# Test data for test() below: each non-blank line is "url = expected",
# where the url is resolved against the first line as base.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""
# XXX The result for //g is actually http://g/; is this a problem?
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000229
def test():
    """Ad-hoc driver (Python 2): parse and join URLs read from a file
    named on the command line ('-' for stdin), or from test_input, and
    print a complaint for any mismatch against the expected values."""
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        # No file given: read the built-in test data.
        import StringIO
        fp = StringIO.StringIO(test_input)
    while 1:
        line = fp.readline()
        if not line: break
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print '%-10s : %s' % (url, parts)
        # The first URL seen becomes the base for all later joins.
        abs = urljoin(base, url)
        if not base:
            base = abs
        wrapped = '<URL:%s>' % abs
        print '%-10s = %s' % (url, wrapped)
        # Lines of the form "url = expected" are checked.
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print 'EXPECTED', words[2], '!!!!!!!!!!'

if __name__ == '__main__':
    test()