blob: 08722b17d3036e3c4460b53400f237e1e2db191f [file] [log] [blame]
# Parse (absolute and relative) URLs.  See RFC 1808: "Relative Uniform
# Resource Locators", by R. Fielding, UC Irvine, June 1995.
Guido van Rossum23cb2a81994-09-12 10:36:35 +00003
4# Standard/builtin Python modules
5import string
6
# A classification of schemes ('' means apply by default, i.e. it is the
# entry matched when a URL has no scheme of its own).
# These are plain lists so that callers can extend them at run time.

# Schemes for which urljoin() resolves a relative URL against the base.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'prospero', '']
# Schemes for which urlparse() treats a leading '//' as a network location.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'prospero', '']
# Not consulted by the functions in this part of the file.
non_hierarchical = ['gopher', 'mailto', 'news', 'telnet', 'wais']
# Schemes for which urlparse() splits off ';params', '?query' and
# '#fragment' respectively.
uses_params = ['ftp', 'prospero', 'http', '']
uses_query = ['http', 'wais', '']
uses_fragment = ['ftp', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'file', 'prospero', '']

# Characters valid in scheme names.  ascii_letters is used rather than
# string.letters, which varies with the locale and does not exist on
# Python 3.
scheme_chars = string.ascii_letters + string.digits + '+-.'
20
# Parse a URL into 6 components:
# <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
# Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
# Note that we don't break the components up into smaller bits
# (e.g. netloc is a single string) and we don't expand % escapes.
def urlparse(url, scheme='', allow_framents=1):
    """Split a URL into six components.

    The general form is
        <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Arguments:
        url -- the URL string to split.
        scheme -- scheme to assume when url does not carry one itself.
        allow_framents -- (sic; the misspelling is part of the public
            signature) when false, '#' is never treated as a fragment
            delimiter and stays in whatever component contains it.

    Returns the tuple (scheme, netloc, path, params, query, fragment).
    Components that are absent come back as ''.  The pieces are not
    broken up any further and % escapes are not expanded.
    """
    netloc = params = query = fragment = ''

    # A scheme is recognised only when everything before the first ':'
    # is a non-empty run of scheme_chars; otherwise the caller's default
    # is kept and the ':' stays in the URL text.
    colon = url.find(':')
    if colon > 0:
        for ch in url[:colon]:
            if ch not in scheme_chars:
                break
        else:
            scheme, url = url[:colon].lower(), url[colon + 1:]

    # The network location runs from '//' up to the next '/', or to the
    # end of the string when there is none.
    # NOTE(review): this is cut before the fragment/query are split off,
    # so for 'http://host#frag' the '#frag' ends up inside netloc.
    if scheme in uses_netloc and url[:2] == '//':
        end = url.find('/', 2)
        if end < 0:
            end = len(url)
        netloc, url = url[2:end], url[end:]

    # Peel the trailing components off right to left.  The fragment is
    # split at the LAST '#'; query and params at the FIRST '?' / ';'.
    if allow_framents and scheme in uses_fragment:
        pos = url.rfind('#')
        if pos >= 0:
            url, fragment = url[:pos], url[pos + 1:]
    if scheme in uses_query:
        pos = url.find('?')
        if pos >= 0:
            url, query = url[:pos], url[pos + 1:]
    if scheme in uses_params:
        pos = url.find(';')
        if pos >= 0:
            url, params = url[:pos], url[pos + 1:]

    # Whatever is left is the path.
    return scheme, netloc, url, params, query, fragment
58
# Put a parsed URL back together again.  This may result in a slightly
# different, but equivalent, URL if the URL that was originally parsed
# had redundant delimiters, e.g. a '?' with an empty query (the draft
# states that these are equivalent).
def urlunparse(components):
    """Reassemble a URL from its six components.

    Arguments:
        components -- a 6-item sequence
            (scheme, netloc, path, params, query, fragment), such as
            the tuple returned by urlparse().  It is passed as ONE
            argument, exactly as before; a sequence of any other
            length raises ValueError on unpacking.

    Returns the URL string.  Empty components are left out together
    with their delimiter, so the result may differ from (but is
    equivalent to) the originally parsed text when that text contained
    redundant delimiters such as a '?' followed by an empty query.
    """
    scheme, netloc, url, params, query, fragment = components
    if netloc:
        # A relative path must be separated from the netloc by '/', but
        # an empty path stays empty so that 'http://g' round-trips
        # instead of turning into 'http://g/'.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + netloc + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
76
# Join a base URL and a possibly relative URL to form an absolute
# interpretation of the latter.
def urljoin(base, url, allow_framents=1):
    """Resolve a possibly relative URL against a base URL.

    Follows the resolution algorithm of RFC 1808, section 4.

    Arguments:
        base -- the base URL; when empty, url is returned unchanged.
        url -- the URL to resolve; when empty, base is returned
            unchanged (RFC 1808 step 2b).
        allow_framents -- (sic) passed through to urlparse() for both
            URLs.

    Returns the resulting URL string, rebuilt with urlunparse().
    """
    if not base:
        return url
    if not url:
        # An entirely empty reference inherits the whole base URL,
        # fragment included.
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_framents)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_framents)

    # XXX Unofficial hack: default netloc to bnetloc even if
    # schemes differ
    if scheme != bscheme and not netloc and \
       scheme in uses_relative and bscheme in uses_relative and \
       scheme in uses_netloc and bscheme in uses_netloc:
        netloc = bnetloc
        # Strip the port number.  The search for ':' starts at the '@'
        # (if there is one) so that a ':' before it is left alone.
        at = netloc.find('@')
        if at < 0:
            at = 0
        colon = netloc.find(':', at)
        if colon >= 0:
            netloc = netloc[:colon]

    # A different scheme, or one that has no relative form, means the
    # URL is taken as it stands.
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: nothing to merge.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # RFC 1808 step 5: an empty path inherits the base path; params
        # are inherited only when the reference has none, and the query
        # only when it has neither params nor a query of its own.
        path = bpath
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))

    # Relative path: replace the last segment of the base path with it.
    slash = bpath.rfind('/')
    if slash >= 0:
        path = bpath[:slash] + '/' + path

    segments = path.split('/')
    # A final '.' names the directory itself, so keep the trailing '/'.
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']

    # Repeatedly drop the first '<segment>/..' pair found, rescanning
    # from the start after each removal.  The final segment is handled
    # separately below.  A '..' is only cancelled by a real segment:
    # never by the empty root segment and never by another '..', so
    # surplus '..' survive (e.g. '/../../g').
    while 1:
        i = 1
        last = len(segments) - 1
        while i < last:
            if segments[i] == '..' and segments[i - 1] not in ('', '..'):
                del segments[i - 1:i + 1]
                break
            i = i + 1
        else:
            break

    if segments == ['', '..']:
        # '..' directly under the root resolves to the root itself;
        # keep the path as '/' rather than letting it become empty.
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # A trailing '<segment>/..' collapses to the parent directory.
        segments[-2:] = ['']
    path = '/'.join(segments)
    return urlunparse((scheme, netloc, path,
                       params, query, fragment))
138
# Built-in test table read by test().  The first non-blank line is the
# base URL; every following line is "<relative> = <URL:expected>".
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
"""
169
def test():
    """Run the URL test table and print the results.

    Input is read from the file named by the first command-line
    argument ('-' means standard input); without an argument the
    built-in test_input table is used.

    For every non-blank line the first word is parsed with urlparse()
    and joined to the current base with urljoin(); both results are
    printed.  The first URL seen becomes the base for the rest.  A line
    of the form "<url> = <expected>" additionally prints an EXPECTED
    marker when the wrapped result differs from <expected>.
    """
    import sys
    base = ''
    opened = None                   # a file we opened and must close
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        try:
            from StringIO import StringIO   # Python 2
        except ImportError:
            from io import StringIO         # Python 3
        fp = StringIO(test_input)
    try:
        while 1:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                # urljoin() returns url unchanged while base is empty,
                # so the first URL read serves as the base.
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        # Never close sys.stdin or the in-memory table, only our file.
        if opened is not None:
            opened.close()
199
# Executed as a script (rather than imported): run the self-test.
if __name__ == "__main__":
    test()