"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""

# A classification of schemes ('' means apply by default)

# Schemes for which urljoin() resolves a relative reference against a base.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',
                 'prospero', 'rtsp', 'rtspu', '']
# Schemes whose URLs may carry a '//<netloc>' part.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'file',
               'https', 'shttp', 'snews',
               'prospero', 'rtsp', 'rtspu', '']
# Schemes without a hierarchical path (not consulted by the functions below).
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews', 'sip',
                    ]
# Schemes for which urlparse() splits off ';<params>'.
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip',
               '']
# Schemes for which urlparse() splits off '?<query>'.
uses_query = ['http', 'wais',
              'https', 'shttp',
              'gopher', 'rtsp', 'rtspu', 'sip',
              '']
# Schemes for which urlparse() splits off '#<fragment>'.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlparse() memoizes its results in _parse_cache and empties the cache
# once it holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}
def clear_cache():
    """Clear the parse cache.

    Rebinds the module-level ``_parse_cache`` to a new, empty dictionary,
    discarding every result memoized by urlparse().
    """
    global _parse_cache
    _parse_cache = {}
def urlparse(url, scheme='', allow_fragments=1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Arguments:
    url -- the URL string to split.
    scheme -- scheme to report when url does not start with a valid
        '<scheme>:' prefix.
    allow_fragments -- when false, '#' is not treated as a fragment
        delimiter and the fragment component is always ''.

    A component is only split off when the scheme appears in the
    matching uses_* table (the '' entry covers scheme-less URLs).
    Results are memoized in _parse_cache, keyed on all three arguments.
    """
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = params = query = fragment = ''
    # A literal lower-case 'http' prefix is the common case: it skips the
    # per-character scheme validation and every uses_* table lookup.
    is_http = False
    i = url.find(':')
    if i > 0:
        prefix = url[:i]
        if prefix == 'http':
            is_http = True
            scheme, url = prefix, url[i+1:]
        else:
            for c in prefix:
                if c not in scheme_chars:
                    # Not a scheme after all; the ':' belongs to the rest.
                    break
            else:
                scheme, url = prefix.lower(), url[i+1:]
    if (is_http or scheme in uses_netloc) and url[:2] == '//':
        # The network location runs up to the next '/' (or end of string).
        i = url.find('/', 2)
        if i < 0:
            i = len(url)
        netloc, url = url[2:i], url[i:]
    if allow_fragments and (is_http or scheme in uses_fragment):
        i = url.rfind('#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if is_http or scheme in uses_query:
        i = url.find('?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if is_http or scheme in uses_params:
        i = url.find(';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    # Whatever is left of url at this point is the path component.
    parsed = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = parsed
    return parsed
def urlunparse(components):
    """Put a parsed URL back together again.

    components is a 6-item sequence
    (scheme, netloc, path, params, query, fragment), as returned by
    urlparse(); the assembled URL string is returned.

    This may result in a slightly different, but equivalent URL, if the
    URL that was parsed originally had redundant delimiters, e.g. a ?
    with an empty query (the draft states that these are equivalent).
    """
    # Unpacked here instead of in the signature: tuple parameters are a
    # syntax error on Python 3, while callers still pass one sequence.
    scheme, netloc, url, params, query, fragment = components
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        # A path that follows a network location must be absolute.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
def urljoin(base, url, allow_fragments=1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    Arguments:
    base -- the base URL; if empty, url is returned unchanged.
    url -- the reference to resolve; if empty, base is returned unchanged.
    allow_fragments -- passed through to urlparse() for both URLs.

    url is also returned unchanged when its scheme differs from the
    base's or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    # The base scheme serves as the default for a scheme-less reference.
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own host: nothing to inherit.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # An absolute path replaces the base path outright.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: keep the base path and fill in missing params/query.
        if not params:
            params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Drop the last segment of the base path, then append the reference.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly collapse the first interior '<name>/..' pair until no
    # such pair is left; the final segment is handled separately below.
    while True:
        for i in range(1, len(segments) - 1):
            if segments[i] == '..' and segments[i-1] not in ('', '..'):
                del segments[i-1:i+1]
                break
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    scheme, netloc, path, params, query, fragment = urlparse(url)
    # Reassemble everything except the fragment.
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
# Self-test table read by test(): the first non-blank line is the base URL;
# every following line is '<relative URL> = <expected result>'.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""
# XXX The result for //g is actually http://g/; is this a problem?
def test():
    """Run the urlparse()/urljoin() self-test and print the results.

    With no command-line argument the built-in test_input table is used;
    with '-' lines are read from standard input; any other first
    argument names a file to read.

    The first URL encountered becomes the base for all later joins.  For
    each non-blank line the parsed components and the joined URL are
    printed; when the line has the form '<url> = <expected>' and the
    joined result differs, an 'EXPECTED' line is printed as well.
    """
    import sys

    emit = sys.stdout.write

    def run(lines):
        base = ''
        for line in lines:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            emit('%-10s : %s\n' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            emit('%-10s = %s\n' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    emit('EXPECTED %s !!!!!!!!!!\n' % words[2])

    args = sys.argv[1:]
    if not args:
        run(test_input.splitlines())
    elif args[0] == '-':
        # readline-driven so interactive input is handled line by line.
        run(iter(sys.stdin.readline, ''))
    else:
        fp = open(args[0])
        try:
            run(iter(fp.readline, ''))
        finally:
            fp.close()
# Run the self-test when the module is executed as a script.
if __name__ == '__main__':
    test()