blob: 571ef0eb0c6194716b2eaa49b7e12b3e8ffb8e88 [file] [log] [blame]
# Parse (absolute and relative) URLs.  See RFC 1808: "Relative Uniform
# Resource Locators", by R. Fielding, UC Irvine, June 1995.
Guido van Rossum23cb2a81994-09-12 10:36:35 +00003
# Standard/builtin Python modules
5import string
Guido van Rossum3fd32ec1996-05-28 23:54:24 +00006from string import joinfields, splitfields, find, rfind
Guido van Rossum23cb2a81994-09-12 10:36:35 +00007
# A classification of schemes ('' means apply by default).
# Each list names the schemes for which the corresponding URL feature
# is recognised; the empty string stands for "no scheme given".
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'prospero', '']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'prospero', '']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais']
uses_params = ['ftp', 'hdl', 'prospero', 'http', '']
uses_query = ['http', 'wais', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'file', 'prospero', '']

# Characters valid in scheme names.  The letters are spelled out as an
# ASCII literal rather than taken from string.letters, whose value is
# locale-dependent and could admit non-ASCII characters into a scheme.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# Memo of urlparse() results, keyed by (url, scheme, allow_framents).
_parse_cache = {}
23
def clear_cache():
    """Forget every memoized urlparse() result.

    The module-level cache name is rebound to a brand-new empty
    dictionary; the previous dictionary object is left untouched.
    """
    global _parse_cache
    _parse_cache = {}
27
28
# Upper bound on the number of memoized urlparse() results; when it is
# reached the cache is emptied so that it cannot grow without limit.
_MAX_CACHE_SIZE = 20


def urlparse(url, scheme='', allow_framents=1):
    """Parse a URL into 6 components.

    The general form of a URL is
        <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    url -- the URL string to parse.
    scheme -- default scheme, used when url does not start with one.
        A scheme found in url is converted to lower case.
    allow_framents -- when false, '#' is not treated as a fragment
        delimiter and the fragment component is returned empty.  (The
        spelling of this parameter name is kept for compatibility
        with callers that pass it by keyword.)

    Which delimiters are honoured depends on the scheme, according to
    the uses_netloc, uses_fragment, uses_query and uses_params tables.
    Results are memoized in _parse_cache.
    """
    key = url, scheme, allow_framents
    try:
        return _parse_cache[key]
    except KeyError:
        pass
    if len(_parse_cache) >= _MAX_CACHE_SIZE:
        # Keep the memo bounded; emptying in place is enough because
        # entries are cheap to recompute.
        _parse_cache.clear()
    netloc = params = query = fragment = ''
    i = url.find(':')
    if i > 0:
        # Everything before the first ':' is a scheme only if it
        # consists solely of valid scheme characters.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if scheme in uses_netloc:
        if url[:2] == '//':
            # The network location runs up to the next '/', or to the
            # end of the string when there is none.
            i = url.find('/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if allow_framents and scheme in uses_fragment:
        # RFC 1808 section 2.4.1: the fragment starts after the first
        # (left-most) '#', so later '#' characters belong to it.
        i = url.find('#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_query:
        i = url.find('?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = url.find(';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    # Whatever is left of url at this point is the path.
    result = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = result
    return result
Guido van Rossum23cb2a81994-09-12 10:36:35 +000067
def urlunparse(components):
    """Put a parsed URL back together again.

    components -- a sequence of six strings
        (scheme, netloc, path, params, query, fragment), as returned
        by urlparse().

    Returns the URL as a single string.  This may result in a slightly
    different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).
    """
    # Unpacked here instead of in the signature: tuple parameters are
    # not accepted by every Python version, and callers pass a single
    # sequence argument either way.
    scheme, netloc, url, params, query, fragment = components
    if netloc:
        # A relative path needs a '/' to separate it from the network
        # location, but an empty path must stay empty so that
        # 'http://host' is not turned into 'http://host/'.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + netloc + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
85
def _collapse_dot_segments(path):
    # Remove '.' segments and resolvable '<segment>/..' pairs from path.
    segments = path.split('/')
    if segments[-1] == '.':
        # A trailing '.' names the directory itself: keep the final '/'.
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            # '..' cancels the segment before it, unless that segment
            # is empty or is itself a '..' that could not be resolved.
            if segments[i] == '..' and segments[i-1] not in ('', '..'):
                del segments[i-1:i+1]
                break
            i = i + 1
        else:
            break
    if segments == ['', '..']:
        # '/..' collapses to the root path '/'.
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..' \
            and segments[-2] != '..':
        # A trailing '<segment>/..' is removed, leaving a final '/'.
        segments[-2:] = ['']
    return '/'.join(segments)


def urljoin(base, url, allow_framents=1):
    """Join a base URL and a possibly relative URL.

    Returns an absolute interpretation of url, taking the components
    that url lacks from base.

    base -- the base URL; when empty, url is returned unchanged.
    url -- the URL to interpret, possibly relative.
    allow_framents -- passed through to urlparse() for both URLs.
        (The spelling of this parameter name is kept for compatibility
        with callers that pass it by keyword.)
    """
    if not base:
        return url
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_framents)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_framents)
    # XXX Unofficial hack: default netloc to bnetloc even if
    # schemes differ
    if scheme != bscheme and not netloc and \
       scheme in uses_relative and bscheme in uses_relative and \
       scheme in uses_netloc and bscheme in uses_netloc:
        netloc = bnetloc
        # Strip the port number (the first ':' after any '@')
        i = netloc.find('@')
        if i < 0:
            i = 0
        i = netloc.find(':', i)
        if i >= 0:
            netloc = netloc[:i]
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: nothing more is taken from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: inherit the base path.  Per RFC 1808 (section 4,
        # step 5) the base params are inherited only when url has
        # none, and the base query only when url has neither params
        # nor query of its own.
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Relative path: replace the last segment of the base path with it.
    i = bpath.rfind('/')
    if i >= 0:
        path = bpath[:i] + '/' + path
    return urlunparse((scheme, netloc, _collapse_dot_segments(path),
                       params, query, fragment))
143
def urldefrag(url):
    """Split a URL into its fragment-free form and its fragment.

    The URL is parsed with urlparse() and put back together with
    urlunparse() using an empty fragment.  Returns the pair
    (defragmented URL, fragment); the fragment is the empty string
    when urlparse() found none.
    """
    scheme, netloc, path, params, query, fragment = urlparse(url)
    without_fragment = urlunparse((scheme, netloc, path, params, query, ''))
    return without_fragment, fragment
154
155
Guido van Rossum23cb2a81994-09-12 10:36:35 +0000156test_input = """
157 http://a/b/c/d
158
159 g:h = <URL:g:h>
160 http:g = <URL:http://a/b/c/g>
161 http: = <URL:http://a/b/c/d>
162 g = <URL:http://a/b/c/g>
163 ./g = <URL:http://a/b/c/g>
164 g/ = <URL:http://a/b/c/g/>
165 /g = <URL:http://a/g>
166 //g = <URL:http://g>
167 ?y = <URL:http://a/b/c/d?y>
168 g?y = <URL:http://a/b/c/g?y>
169 g?y/./x = <URL:http://a/b/c/g?y/./x>
170 . = <URL:http://a/b/c/>
171 ./ = <URL:http://a/b/c/>
172 .. = <URL:http://a/b/>
173 ../ = <URL:http://a/b/>
174 ../g = <URL:http://a/b/g>
175 ../.. = <URL:http://a/>
176 ../../g = <URL:http://a/g>
177 ../../../g = <URL:http://a/../g>
178 ./../g = <URL:http://a/b/g>
179 ./g/. = <URL:http://a/b/c/g/>
180 /./g = <URL:http://a/./g>
181 g/./h = <URL:http://a/b/c/g/h>
182 g/../h = <URL:http://a/b/c/h>
183 http:g = <URL:http://a/b/c/g>
184 http: = <URL:http://a/b/c/d>
185"""
186
187def test():
188 import sys
189 base = ''
190 if sys.argv[1:]:
191 fn = sys.argv[1]
192 if fn == '-':
193 fp = sys.stdin
194 else:
195 fp = open(fn)
196 else:
197 import StringIO
198 fp = StringIO.StringIO(test_input)
199 while 1:
200 line = fp.readline()
201 if not line: break
202 words = string.split(line)
203 if not words:
204 continue
205 url = words[0]
206 parts = urlparse(url)
207 print '%-10s : %s' % (url, parts)
208 abs = urljoin(base, url)
209 if not base:
210 base = abs
211 wrapped = '<URL:%s>' % abs
212 print '%-10s = %s' % (url, wrapped)
213 if len(words) == 3 and words[1] == '=':
214 if wrapped != words[2]:
215 print 'EXPECTED', words[2], '!!!!!!!!!!'
216
217if __name__ == '__main__':
218 test()