blob: ef2384b08f8213b8e21d229e31be91202138f426 [file] [log] [blame]
# Parse (absolute and relative) URLs according to the latest internet draft:

# Uniform Resource Identifiers Working Group                  R. Fielding
# INTERNET-DRAFT                                                UC Irvine
# Expires February 24, 1995                               August 24, 1994
#
#                   Relative Uniform Resource Locators
#                  <draft-ietf-uri-relative-url-00.txt>
9
10# Standard/builtin Python modules
11import string
12
# A classification of schemes ('' means apply by default, i.e. to URLs
# that carry no scheme of their own).
#   uses_relative    - schemes for which urljoin() resolves relative paths
#   uses_netloc      - schemes whose URLs may carry a '//netloc' part
#   non_hierarchical - not consulted by the functions in this file
#   uses_params      - schemes whose URLs may carry ';params'
#   uses_query       - schemes whose URLs may carry '?query'
#   uses_fragment    - schemes whose URLs may carry '#fragment'
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'prospero', '']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'file', 'prospero', '']
non_hierarchical = ['gopher', 'mailto', 'news', 'telnet', 'wais']
uses_params = ['ftp', 'prospero', '']
uses_query = ['http', 'wais', '']
uses_fragment = ['ftp', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'file', 'prospero', '']

# Characters valid in scheme names.  Spelled out literally rather than
# built from string.letters: that constant follows the current locale,
# so the set of accepted scheme characters could change at run time.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
26
def urlparse(url, scheme = '', allow_framents = 1):
    """Parse a URL into 6 components.

    The general form is
        <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    and the result is the 6-tuple
        (scheme, netloc, path, params, query, fragment).

    The components are not broken up into smaller bits (e.g. netloc is
    a single string) and % escapes are not expanded.

    Arguments:
      url            -- the URL string to parse.
      scheme         -- scheme to assume when the URL does not start
                        with one; it also selects which components are
                        split off (see the uses_* tables).
      allow_framents -- (sic; the spelling is part of the keyword
                        interface) when false, '#' is not treated as a
                        fragment delimiter.

    Missing components are returned as ''.  Example:
      urlparse('http://a/b;p?q#f') -> ('http', 'a', '/b;p', '', 'q', 'f')
    (';p' stays in the path because 'http' is not in uses_params).
    """
    netloc = params = query = fragment = ''

    # A leading run of scheme characters followed by ':' is the scheme.
    # The colon must not be the first character (empty scheme).
    colon = url.find(':')
    if colon > 0:
        for c in url[:colon]:
            if c not in scheme_chars:
                break
        else:
            # Every character was valid: schemes are case-insensitive,
            # so normalize to lower case.
            scheme, url = url[:colon].lower(), url[colon+1:]

    # '//netloc' runs up to (not including) the next '/', or to the
    # end of the string if there is none.
    if scheme in uses_netloc and url[:2] == '//':
        end = url.find('/', 2)
        if end < 0:
            end = len(url)
        netloc, url = url[2:end], url[end:]

    # The fragment is split at the LAST '#'; query and params are
    # split at the FIRST '?' and ';' of what remains.
    if allow_framents and scheme in uses_fragment:
        pos = url.rfind('#')
        if pos >= 0:
            url, fragment = url[:pos], url[pos+1:]
    if scheme in uses_query:
        pos = url.find('?')
        if pos >= 0:
            url, query = url[:pos], url[pos+1:]
    if scheme in uses_params:
        pos = url.find(';')
        if pos >= 0:
            url, params = url[:pos], url[pos+1:]

    # Whatever is left of the URL is the path.
    return scheme, netloc, url, params, query, fragment
64
def urlunparse(parts):
    """Put a parsed URL back together again.

    parts is a 6-item sequence
        (scheme, netloc, path, params, query, fragment)
    as returned by urlparse().  Returns the URL as a string.

    The result may be slightly different from, but equivalent to, the
    URL that was parsed originally if that URL had redundant
    delimiters, e.g. a ? with an empty query (the draft states that
    these are equivalent).
    """
    # Unpacked here rather than in the signature so the definition is
    # valid on interpreters without tuple parameters; callers still
    # pass a single 6-item sequence.
    scheme, netloc, url, params, query, fragment = parts
    if netloc:
        # A non-empty relative path must be separated from the netloc
        # by '/'.  An empty path is left empty so that 'http://g' does
        # not turn into 'http://g/'.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + netloc + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
82
def urljoin(base, url, allow_framents = 1):
    """Join a base URL and a possibly relative URL.

    Returns the absolute interpretation of url relative to base, as a
    string.  If base is empty, url is returned unchanged.

    Arguments:
      base           -- the base URL.
      url            -- the URL to resolve; may be absolute or relative.
      allow_framents -- (sic) passed through to urlparse() for both
                        URLs.

    Examples (base 'http://a/b/c/d'):
      '../g'          -> 'http://a/b/g'
      '../../../../g' -> 'http://a/../../g'
    """
    if not base:
        return url
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_framents)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_framents)

    # XXX Unofficial hack: default netloc to bnetloc even if
    # schemes differ
    if scheme != bscheme and not netloc and \
       scheme in uses_relative and bscheme in uses_relative and \
       scheme in uses_netloc and bscheme in uses_netloc:
        netloc = bnetloc
        # Strip the port number.  The search for ':' starts at any
        # '@' so that a colon in a 'user:password@' prefix is skipped.
        i = netloc.find('@')
        if i < 0:
            i = 0
        i = netloc.find(':', i)
        if i >= 0:
            netloc = netloc[:i]

    # A different scheme, or one without relative forms: url is
    # already absolute.
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    # An absolute path replaces the base path entirely.
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # An empty path inherits the base path and, if it has no query of
    # its own, the base query as well.
    if not path:
        path = bpath
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))

    # Relative path: replace the last segment of the base path.
    i = bpath.rfind('/')
    if i >= 0:
        path = bpath[:i] + '/' + path

    segments = path.split('/')
    # A trailing '.' names the directory itself: keep the final '/'.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly remove the leftmost '<segment>/..' pair.  The segment
    # in front must be a real one: an empty segment (the root) or
    # another '..' cannot be cancelled, so surplus '..' are preserved.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if segments[i] == '..' and segments[i-1] not in ('', '..'):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        # '/..' at the root: drop the '..' but keep the root slash so
        # the result remains an absolute path.
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # A trailing '<segment>/..' collapses to the parent directory.
        segments[-2:] = ['']
    path = '/'.join(segments)
    return urlunparse((scheme, netloc, path,
                       params, query, fragment))
144
# Built-in vectors for test().  The first non-blank line is the base
# URL; every following line has the form
#     <relative-url> = <URL:expected-absolute-url>
# and is checked by joining the relative URL to the base.
test_input = """
 http://a/b/c/d

 g:h = <URL:g:h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 g = <URL:http://a/b/c/g>
 ./g = <URL:http://a/b/c/g>
 g/ = <URL:http://a/b/c/g/>
 /g = <URL:http://a/g>
 //g = <URL:http://g>
 ?y = <URL:http://a/b/c/d?y>
 g?y = <URL:http://a/b/c/g?y>
 g?y/./x = <URL:http://a/b/c/g?y/./x>
 . = <URL:http://a/b/c/>
 ./ = <URL:http://a/b/c/>
 .. = <URL:http://a/b/>
 ../ = <URL:http://a/b/>
 ../g = <URL:http://a/b/g>
 ../.. = <URL:http://a/>
 ../../g = <URL:http://a/g>
 ../../../g = <URL:http://a/../g>
 ./../g = <URL:http://a/b/g>
 ./g/. = <URL:http://a/b/c/g/>
 /./g = <URL:http://a/./g>
 g/./h = <URL:http://a/b/c/g/h>
 g/../h = <URL:http://a/b/c/h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
"""
175
def test():
    """Run the self-test and print the results on standard output.

    Input is read from the file named by the first command line
    argument ('-' means standard input), or from the built-in
    test_input when no argument is given.

    The first URL read becomes the base; each URL is parsed and joined
    to the base, and both results are printed.  A line of the form
    'url = <URL:expected>' whose expected value differs from the
    computed one is followed by an 'EXPECTED ...' line.
    """
    import sys
    write = sys.stdout.write
    base = ''
    opened = None       # file we opened ourselves and must close
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        # The StringIO module only exists on older interpreters; fall
        # back to io.StringIO where it has been removed.
        try:
            from StringIO import StringIO
        except ImportError:
            from io import StringIO
        fp = StringIO(test_input)
    try:
        while 1:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            write('%-10s : %s\n' % (url, parts))
            joined = urljoin(base, url)
            # The first URL seen serves as the base for the rest.
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            write('%-10s = %s\n' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    write('EXPECTED %s !!!!!!!!!!\n' % words[2])
    finally:
        # Never close sys.stdin or the in-memory stream's owner state;
        # only release the file this function opened.
        if opened is not None:
            opened.close()

if __name__ == '__main__':
    test()