blob: 103f89d4c5a15473494efdc2e6bc722ae6f06c11 [file] [log] [blame]
Skip Montanaro6ec967d2002-03-23 05:32:10 +00001#! /usr/bin/env python
2
Benjamin Petersonee8712c2008-05-20 21:35:26 +00003from test import support
Skip Montanaro6ec967d2002-03-23 05:32:10 +00004import unittest
Jeremy Hylton1afc1692008-06-18 20:49:58 +00005import urllib.parse
Fred Drakea4d18a02001-01-05 05:57:04 +00006
# Base URLs that the urljoin() tests resolve relative references against.
# The RFC 1808 base carries a fragment ("#f"); the RFC 2396 and RFC 3986
# bases are the same URL without one.
RFC1808_BASE = "http://a/b/c/d;p?q#f"
RFC2396_BASE = "http://a/b/c/d;p?q"
RFC3986_BASE = "http://a/b/c/d;p?q"
Fred Drakea4d18a02001-01-05 05:57:04 +000010
Skip Montanaro6ec967d2002-03-23 05:32:10 +000011class UrlParseTestCase(unittest.TestCase):
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +000012
13 def checkRoundtrips(self, url, parsed, split):
Jeremy Hylton1afc1692008-06-18 20:49:58 +000014 result = urllib.parse.urlparse(url)
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +000015 self.assertEqual(result, parsed)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016 t = (result.scheme, result.netloc, result.path,
17 result.params, result.query, result.fragment)
18 self.assertEqual(t, parsed)
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +000019 # put it back together and it should be the same
Jeremy Hylton1afc1692008-06-18 20:49:58 +000020 result2 = urllib.parse.urlunparse(result)
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +000021 self.assertEqual(result2, url)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000022 self.assertEqual(result2, result.geturl())
23
24 # the result of geturl() is a fixpoint; we can always parse it
25 # again to get the same result:
Jeremy Hylton1afc1692008-06-18 20:49:58 +000026 result3 = urllib.parse.urlparse(result.geturl())
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000027 self.assertEqual(result3.geturl(), result.geturl())
28 self.assertEqual(result3, result)
29 self.assertEqual(result3.scheme, result.scheme)
30 self.assertEqual(result3.netloc, result.netloc)
31 self.assertEqual(result3.path, result.path)
32 self.assertEqual(result3.params, result.params)
33 self.assertEqual(result3.query, result.query)
34 self.assertEqual(result3.fragment, result.fragment)
35 self.assertEqual(result3.username, result.username)
36 self.assertEqual(result3.password, result.password)
37 self.assertEqual(result3.hostname, result.hostname)
38 self.assertEqual(result3.port, result.port)
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +000039
40 # check the roundtrip using urlsplit() as well
Jeremy Hylton1afc1692008-06-18 20:49:58 +000041 result = urllib.parse.urlsplit(url)
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +000042 self.assertEqual(result, split)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000043 t = (result.scheme, result.netloc, result.path,
44 result.query, result.fragment)
45 self.assertEqual(t, split)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000046 result2 = urllib.parse.urlunsplit(result)
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +000047 self.assertEqual(result2, url)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000048 self.assertEqual(result2, result.geturl())
49
50 # check the fixpoint property of re-parsing the result of geturl()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000051 result3 = urllib.parse.urlsplit(result.geturl())
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000052 self.assertEqual(result3.geturl(), result.geturl())
53 self.assertEqual(result3, result)
54 self.assertEqual(result3.scheme, result.scheme)
55 self.assertEqual(result3.netloc, result.netloc)
56 self.assertEqual(result3.path, result.path)
57 self.assertEqual(result3.query, result.query)
58 self.assertEqual(result3.fragment, result.fragment)
59 self.assertEqual(result3.username, result.username)
60 self.assertEqual(result3.password, result.password)
61 self.assertEqual(result3.hostname, result.hostname)
62 self.assertEqual(result3.port, result.port)
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +000063
64 def test_roundtrips(self):
65 testcases = [
Fred Drake70705652002-10-16 21:02:36 +000066 ('file:///tmp/junk.txt',
67 ('file', '', '/tmp/junk.txt', '', '', ''),
68 ('file', '', '/tmp/junk.txt', '', '')),
Neal Norwitz68b539e2003-01-06 06:58:31 +000069 ('imap://mail.python.org/mbox1',
70 ('imap', 'mail.python.org', '/mbox1', '', '', ''),
71 ('imap', 'mail.python.org', '/mbox1', '', '')),
Skip Montanarof09b88e2003-01-06 20:27:03 +000072 ('mms://wms.sys.hinet.net/cts/Drama/09006251100.asf',
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +000073 ('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf',
74 '', '', ''),
75 ('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf',
76 '', '')),
Fred Drake50747fc2005-07-29 15:56:32 +000077 ('svn+ssh://svn.zope.org/repos/main/ZConfig/trunk/',
78 ('svn+ssh', 'svn.zope.org', '/repos/main/ZConfig/trunk/',
79 '', '', ''),
80 ('svn+ssh', 'svn.zope.org', '/repos/main/ZConfig/trunk/',
81 '', ''))
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +000082 ]
83 for url, parsed, split in testcases:
84 self.checkRoundtrips(url, parsed, split)
Michael W. Hudsonbd3e7712002-03-18 13:06:00 +000085
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +000086 def test_http_roundtrips(self):
Jeremy Hylton1afc1692008-06-18 20:49:58 +000087 # urllib.parse.urlsplit treats 'http:' as an optimized special case,
Johannes Gijsbers41e4faa2005-01-09 15:29:10 +000088 # so we test both 'http:' and 'https:' in all the following.
89 # Three cheers for white box knowledge!
90 testcases = [
91 ('://www.python.org',
92 ('www.python.org', '', '', '', ''),
93 ('www.python.org', '', '', '')),
94 ('://www.python.org#abc',
95 ('www.python.org', '', '', '', 'abc'),
96 ('www.python.org', '', '', 'abc')),
97 ('://www.python.org?q=abc',
98 ('www.python.org', '', '', 'q=abc', ''),
99 ('www.python.org', '', 'q=abc', '')),
100 ('://www.python.org/#abc',
101 ('www.python.org', '/', '', '', 'abc'),
102 ('www.python.org', '/', '', 'abc')),
103 ('://a/b/c/d;p?q#f',
104 ('a', '/b/c/d', 'p', 'q', 'f'),
105 ('a', '/b/c/d;p', 'q', 'f')),
106 ]
107 for scheme in ('http', 'https'):
108 for url, parsed, split in testcases:
109 url = scheme + url
110 parsed = (scheme,) + parsed
111 split = (scheme,) + split
112 self.checkRoundtrips(url, parsed, split)
Fred Drake70705652002-10-16 21:02:36 +0000113
Skip Montanaro6ec967d2002-03-23 05:32:10 +0000114 def checkJoin(self, base, relurl, expected):
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000115 self.assertEqual(urllib.parse.urljoin(base, relurl), expected,
Guido van Rossumbbc05682002-10-14 19:59:54 +0000116 (base, relurl, expected))
117
118 def test_unparse_parse(self):
119 for u in ['Python', './Python']:
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000120 self.assertEqual(urllib.parse.urlunsplit(urllib.parse.urlsplit(u)), u)
121 self.assertEqual(urllib.parse.urlunparse(urllib.parse.urlparse(u)), u)
Fred Drakea4d18a02001-01-05 05:57:04 +0000122
Skip Montanaro6ec967d2002-03-23 05:32:10 +0000123 def test_RFC1808(self):
124 # "normal" cases from RFC 1808:
125 self.checkJoin(RFC1808_BASE, 'g:h', 'g:h')
126 self.checkJoin(RFC1808_BASE, 'g', 'http://a/b/c/g')
127 self.checkJoin(RFC1808_BASE, './g', 'http://a/b/c/g')
128 self.checkJoin(RFC1808_BASE, 'g/', 'http://a/b/c/g/')
129 self.checkJoin(RFC1808_BASE, '/g', 'http://a/g')
130 self.checkJoin(RFC1808_BASE, '//g', 'http://g')
Skip Montanaro6ec967d2002-03-23 05:32:10 +0000131 self.checkJoin(RFC1808_BASE, 'g?y', 'http://a/b/c/g?y')
132 self.checkJoin(RFC1808_BASE, 'g?y/./x', 'http://a/b/c/g?y/./x')
133 self.checkJoin(RFC1808_BASE, '#s', 'http://a/b/c/d;p?q#s')
134 self.checkJoin(RFC1808_BASE, 'g#s', 'http://a/b/c/g#s')
135 self.checkJoin(RFC1808_BASE, 'g#s/./x', 'http://a/b/c/g#s/./x')
136 self.checkJoin(RFC1808_BASE, 'g?y#s', 'http://a/b/c/g?y#s')
Skip Montanaro6ec967d2002-03-23 05:32:10 +0000137 self.checkJoin(RFC1808_BASE, 'g;x', 'http://a/b/c/g;x')
138 self.checkJoin(RFC1808_BASE, 'g;x?y#s', 'http://a/b/c/g;x?y#s')
139 self.checkJoin(RFC1808_BASE, '.', 'http://a/b/c/')
140 self.checkJoin(RFC1808_BASE, './', 'http://a/b/c/')
141 self.checkJoin(RFC1808_BASE, '..', 'http://a/b/')
142 self.checkJoin(RFC1808_BASE, '../', 'http://a/b/')
143 self.checkJoin(RFC1808_BASE, '../g', 'http://a/b/g')
144 self.checkJoin(RFC1808_BASE, '../..', 'http://a/')
145 self.checkJoin(RFC1808_BASE, '../../', 'http://a/')
146 self.checkJoin(RFC1808_BASE, '../../g', 'http://a/g')
Fred Drakea4d18a02001-01-05 05:57:04 +0000147
Skip Montanaro6ec967d2002-03-23 05:32:10 +0000148 # "abnormal" cases from RFC 1808:
149 self.checkJoin(RFC1808_BASE, '', 'http://a/b/c/d;p?q#f')
150 self.checkJoin(RFC1808_BASE, '../../../g', 'http://a/../g')
151 self.checkJoin(RFC1808_BASE, '../../../../g', 'http://a/../../g')
152 self.checkJoin(RFC1808_BASE, '/./g', 'http://a/./g')
153 self.checkJoin(RFC1808_BASE, '/../g', 'http://a/../g')
154 self.checkJoin(RFC1808_BASE, 'g.', 'http://a/b/c/g.')
155 self.checkJoin(RFC1808_BASE, '.g', 'http://a/b/c/.g')
156 self.checkJoin(RFC1808_BASE, 'g..', 'http://a/b/c/g..')
157 self.checkJoin(RFC1808_BASE, '..g', 'http://a/b/c/..g')
158 self.checkJoin(RFC1808_BASE, './../g', 'http://a/b/g')
159 self.checkJoin(RFC1808_BASE, './g/.', 'http://a/b/c/g/')
160 self.checkJoin(RFC1808_BASE, 'g/./h', 'http://a/b/c/g/h')
161 self.checkJoin(RFC1808_BASE, 'g/../h', 'http://a/b/c/h')
Fred Drakea4d18a02001-01-05 05:57:04 +0000162
Skip Montanaro6ec967d2002-03-23 05:32:10 +0000163 # RFC 1808 and RFC 1630 disagree on these (according to RFC 1808),
164 # so we'll not actually run these tests (which expect 1808 behavior).
165 #self.checkJoin(RFC1808_BASE, 'http:g', 'http:g')
166 #self.checkJoin(RFC1808_BASE, 'http:', 'http:')
Fred Drakea4d18a02001-01-05 05:57:04 +0000167
Skip Montanaro6ec967d2002-03-23 05:32:10 +0000168 def test_RFC2396(self):
169 # cases from RFC 2396
Fred Drakea4d18a02001-01-05 05:57:04 +0000170
Skip Montanaro6ec967d2002-03-23 05:32:10 +0000171
172 self.checkJoin(RFC2396_BASE, 'g:h', 'g:h')
173 self.checkJoin(RFC2396_BASE, 'g', 'http://a/b/c/g')
174 self.checkJoin(RFC2396_BASE, './g', 'http://a/b/c/g')
175 self.checkJoin(RFC2396_BASE, 'g/', 'http://a/b/c/g/')
176 self.checkJoin(RFC2396_BASE, '/g', 'http://a/g')
177 self.checkJoin(RFC2396_BASE, '//g', 'http://g')
178 self.checkJoin(RFC2396_BASE, 'g?y', 'http://a/b/c/g?y')
179 self.checkJoin(RFC2396_BASE, '#s', 'http://a/b/c/d;p?q#s')
180 self.checkJoin(RFC2396_BASE, 'g#s', 'http://a/b/c/g#s')
181 self.checkJoin(RFC2396_BASE, 'g?y#s', 'http://a/b/c/g?y#s')
182 self.checkJoin(RFC2396_BASE, 'g;x', 'http://a/b/c/g;x')
183 self.checkJoin(RFC2396_BASE, 'g;x?y#s', 'http://a/b/c/g;x?y#s')
184 self.checkJoin(RFC2396_BASE, '.', 'http://a/b/c/')
185 self.checkJoin(RFC2396_BASE, './', 'http://a/b/c/')
186 self.checkJoin(RFC2396_BASE, '..', 'http://a/b/')
187 self.checkJoin(RFC2396_BASE, '../', 'http://a/b/')
188 self.checkJoin(RFC2396_BASE, '../g', 'http://a/b/g')
189 self.checkJoin(RFC2396_BASE, '../..', 'http://a/')
190 self.checkJoin(RFC2396_BASE, '../../', 'http://a/')
191 self.checkJoin(RFC2396_BASE, '../../g', 'http://a/g')
192 self.checkJoin(RFC2396_BASE, '', RFC2396_BASE)
193 self.checkJoin(RFC2396_BASE, '../../../g', 'http://a/../g')
194 self.checkJoin(RFC2396_BASE, '../../../../g', 'http://a/../../g')
195 self.checkJoin(RFC2396_BASE, '/./g', 'http://a/./g')
196 self.checkJoin(RFC2396_BASE, '/../g', 'http://a/../g')
197 self.checkJoin(RFC2396_BASE, 'g.', 'http://a/b/c/g.')
198 self.checkJoin(RFC2396_BASE, '.g', 'http://a/b/c/.g')
199 self.checkJoin(RFC2396_BASE, 'g..', 'http://a/b/c/g..')
200 self.checkJoin(RFC2396_BASE, '..g', 'http://a/b/c/..g')
201 self.checkJoin(RFC2396_BASE, './../g', 'http://a/b/g')
202 self.checkJoin(RFC2396_BASE, './g/.', 'http://a/b/c/g/')
203 self.checkJoin(RFC2396_BASE, 'g/./h', 'http://a/b/c/g/h')
204 self.checkJoin(RFC2396_BASE, 'g/../h', 'http://a/b/c/h')
205 self.checkJoin(RFC2396_BASE, 'g;x=1/./y', 'http://a/b/c/g;x=1/y')
206 self.checkJoin(RFC2396_BASE, 'g;x=1/../y', 'http://a/b/c/y')
207 self.checkJoin(RFC2396_BASE, 'g?y/./x', 'http://a/b/c/g?y/./x')
208 self.checkJoin(RFC2396_BASE, 'g?y/../x', 'http://a/b/c/g?y/../x')
209 self.checkJoin(RFC2396_BASE, 'g#s/./x', 'http://a/b/c/g#s/./x')
210 self.checkJoin(RFC2396_BASE, 'g#s/../x', 'http://a/b/c/g#s/../x')
211
Facundo Batista23e38562008-08-14 16:55:14 +0000212 #The following scenarios have been updated in RFC3986
213 #self.checkJoin(RFC2396_BASE, '?y', 'http://a/b/c/?y')
214 #self.checkJoin(RFC2396_BASE, ';x', 'http://a/b/c/;x')
215
216 def test_RFC3986(self):
217 self.checkJoin(RFC3986_BASE, '?y','http://a/b/c/d;p?y')
218 self.checkJoin(RFC2396_BASE, ';x', 'http://a/b/c/;x')
219
Fred Drake70705652002-10-16 21:02:36 +0000220 def test_urldefrag(self):
221 for url, defrag, frag in [
222 ('http://python.org#frag', 'http://python.org', 'frag'),
223 ('http://python.org', 'http://python.org', ''),
224 ('http://python.org/#frag', 'http://python.org/', 'frag'),
225 ('http://python.org/', 'http://python.org/', ''),
226 ('http://python.org/?q#frag', 'http://python.org/?q', 'frag'),
227 ('http://python.org/?q', 'http://python.org/?q', ''),
228 ('http://python.org/p#frag', 'http://python.org/p', 'frag'),
229 ('http://python.org/p?q', 'http://python.org/p?q', ''),
230 (RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'),
231 (RFC2396_BASE, 'http://a/b/c/d;p?q', ''),
232 ]:
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000233 self.assertEqual(urllib.parse.urldefrag(url), (defrag, frag))
Fred Drake70705652002-10-16 21:02:36 +0000234
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000235 def test_urlsplit_attributes(self):
236 url = "HTTP://WWW.PYTHON.ORG/doc/#frag"
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000237 p = urllib.parse.urlsplit(url)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000238 self.assertEqual(p.scheme, "http")
239 self.assertEqual(p.netloc, "WWW.PYTHON.ORG")
240 self.assertEqual(p.path, "/doc/")
241 self.assertEqual(p.query, "")
242 self.assertEqual(p.fragment, "frag")
243 self.assertEqual(p.username, None)
244 self.assertEqual(p.password, None)
245 self.assertEqual(p.hostname, "www.python.org")
246 self.assertEqual(p.port, None)
247 # geturl() won't return exactly the original URL in this case
248 # since the scheme is always case-normalized
249 #self.assertEqual(p.geturl(), url)
250
251 url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag"
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000252 p = urllib.parse.urlsplit(url)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000253 self.assertEqual(p.scheme, "http")
254 self.assertEqual(p.netloc, "User:Pass@www.python.org:080")
255 self.assertEqual(p.path, "/doc/")
256 self.assertEqual(p.query, "query=yes")
257 self.assertEqual(p.fragment, "frag")
258 self.assertEqual(p.username, "User")
259 self.assertEqual(p.password, "Pass")
260 self.assertEqual(p.hostname, "www.python.org")
261 self.assertEqual(p.port, 80)
262 self.assertEqual(p.geturl(), url)
263
Christian Heimesfaf2f632008-01-06 16:59:19 +0000264 # Addressing issue1698, which suggests Username can contain
265 # "@" characters. Though not RFC compliant, many ftp sites allow
266 # and request email addresses as usernames.
267
268 url = "http://User@example.com:Pass@www.python.org:080/doc/?query=yes#frag"
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000269 p = urllib.parse.urlsplit(url)
Christian Heimesfaf2f632008-01-06 16:59:19 +0000270 self.assertEqual(p.scheme, "http")
271 self.assertEqual(p.netloc, "User@example.com:Pass@www.python.org:080")
272 self.assertEqual(p.path, "/doc/")
273 self.assertEqual(p.query, "query=yes")
274 self.assertEqual(p.fragment, "frag")
275 self.assertEqual(p.username, "User@example.com")
276 self.assertEqual(p.password, "Pass")
277 self.assertEqual(p.hostname, "www.python.org")
278 self.assertEqual(p.port, 80)
279 self.assertEqual(p.geturl(), url)
280
281
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000282 def test_attributes_bad_port(self):
283 """Check handling of non-integer ports."""
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000284 p = urllib.parse.urlsplit("http://www.example.net:foo")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000285 self.assertEqual(p.netloc, "www.example.net:foo")
286 self.assertRaises(ValueError, lambda: p.port)
287
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000288 p = urllib.parse.urlparse("http://www.example.net:foo")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000289 self.assertEqual(p.netloc, "www.example.net:foo")
290 self.assertRaises(ValueError, lambda: p.port)
291
292 def test_attributes_without_netloc(self):
293 # This example is straight from RFC 3261. It looks like it
294 # should allow the username, hostname, and port to be filled
295 # in, but doesn't. Since it's a URI and doesn't use the
296 # scheme://netloc syntax, the netloc and related attributes
297 # should be left empty.
298 uri = "sip:alice@atlanta.com;maddr=239.255.255.1;ttl=15"
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000299 p = urllib.parse.urlsplit(uri)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000300 self.assertEqual(p.netloc, "")
301 self.assertEqual(p.username, None)
302 self.assertEqual(p.password, None)
303 self.assertEqual(p.hostname, None)
304 self.assertEqual(p.port, None)
305 self.assertEqual(p.geturl(), uri)
306
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000307 p = urllib.parse.urlparse(uri)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000308 self.assertEqual(p.netloc, "")
309 self.assertEqual(p.username, None)
310 self.assertEqual(p.password, None)
311 self.assertEqual(p.hostname, None)
312 self.assertEqual(p.port, None)
313 self.assertEqual(p.geturl(), uri)
314
Christian Heimesfaf2f632008-01-06 16:59:19 +0000315 def test_noslash(self):
316 # Issue 1637: http://foo.com?query is legal
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000317 self.assertEqual(urllib.parse.urlparse("http://example.com?blahblah=/foo"),
Christian Heimesfaf2f632008-01-06 16:59:19 +0000318 ('http', 'example.com', '', '', 'blahblah=/foo', ''))
319
Facundo Batista2ac5de22008-07-07 18:24:11 +0000320 def test_usingsys(self):
321 # Issue 3314: sys module is used in the error
322 self.assertRaises(TypeError, urllib.parse.urlencode, "foo")
323
def test_main():
    """Run UrlParseTestCase through the test-support unittest runner."""
    support.run_unittest(UrlParseTestCase)
Skip Montanaro6ec967d2002-03-23 05:32:10 +0000326
# Run the suite when this module is executed directly as a script.
if __name__ == "__main__":
    test_main()