blob: 43d8c5d39eeb876f487e0bc03c1f4148f0df0eee [file] [log] [blame]
# -*- coding: iso-8859-1 -*-
""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
Christian Heimesc5f05e42008-02-23 17:40:11 +00009import sys, struct, codecs
Walter Dörwald0fd583c2003-02-21 12:53:50 +000010from test import test_support, string_tests
Guido van Rossuma831cac2000-03-10 23:23:21 +000011
# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search function that hands out deliberately broken codecs.

    For "test.unicode1" the encode/decode callables return a bare integer
    (not a tuple); for "test.unicode2" they return a tuple whose first item
    is an integer rather than a string.  Any other encoding name yields
    None.  The stream reader/writer slots of the returned 4-tuple are None.
    """
    def decode1(input, errors="strict"):
        return 42  # not a tuple

    def encode1(input, errors="strict"):
        return 42  # not a tuple

    def encode2(input, errors="strict"):
        return (42, 42)  # no unicode

    def decode2(input, errors="strict"):
        return (42, 42)  # no unicode

    broken_codecs = {
        "test.unicode1": (encode1, decode1, None, None),
        "test.unicode2": (encode2, decode2, None, None),
    }
    # dict.get() returns None for every name that is not one of the two above.
    return broken_codecs.get(encoding)


codecs.register(search_function)
29
Walter Dörwald0fd583c2003-02-21 12:53:50 +000030class UnicodeTest(
31 string_tests.CommonTest,
Walter Dörwald57d88e52004-08-26 16:53:04 +000032 string_tests.MixinStrUnicodeUserStringTest,
33 string_tests.MixinStrUnicodeTest,
Walter Dörwald0fd583c2003-02-21 12:53:50 +000034 ):
35 type2test = unicode
36
def checkequalnofix(self, result, object, methodname, *args):
    """Call ``object.methodname(*args)`` and compare it with ``result``.

    The receiver and the arguments are used exactly as passed in.  Both
    the value and the exact type of the outcome must match ``result``.
    """
    actual = getattr(object, methodname)(*args)
    self.assertEqual(actual, result)
    self.assertTrue(type(actual) is type(result))

    # if the original is returned make sure that
    # this doesn't happen with subclasses
    if actual is object:
        class usub(unicode):
            def __repr__(self):
                return 'usub(%r)' % unicode.__repr__(self)
        receiver = usub(object)
        actual = getattr(receiver, methodname)(*args)
        self.assertEqual(actual, result)
        self.assertTrue(receiver is not actual)
Guido van Rossume4874ae2001-09-21 15:36:41 +000054
def test_literals(self):
    # Equivalent escape spellings must produce equal unicode objects.
    self.assertEqual(u'\xff', u'\u00ff')
    self.assertEqual(u'\uffff', u'\U0000ffff')
    # Each of these 8-digit \U escapes must be rejected with SyntaxError
    # when the literal is evaluated.
    rejected_literals = (
        'u\'\\Ufffffffe\'',
        'u\'\\Uffffffff\'',
        'u\'\\U%08x\'' % 0x110000,
    )
    for literal in rejected_literals:
        self.assertRaises(SyntaxError, eval, literal)
Jeremy Hylton504de6b2003-10-06 05:08:26 +000061
def test_repr(self):
    # Nothing is checked when sys.platform starts with 'java'.
    if sys.platform.startswith('java'):
        return

    # Test basic sanity of repr()
    basic_cases = [
        (u'abc', "u'abc'"),
        (u'ab\\c', "u'ab\\\\c'"),
        (u'ab\\', "u'ab\\\\'"),
        (u'\\c', "u'\\\\c'"),
        (u'\\', "u'\\\\'"),
        (u'\n', "u'\\n'"),
        (u'\r', "u'\\r'"),
        (u'\t', "u'\\t'"),
        (u'\b', "u'\\x08'"),
        (u"'\"", """u'\\'"'"""),
        (u"'", '''u"'"'''),
        (u'"', """u'"'"""),
    ]
    for text, expected in basic_cases:
        self.assertEqual(repr(text), expected)

    # repr() of the 256 code points 0..255 joined into one string.
    latin1repr = (
        "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
        "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
        "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
        "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
        "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
        "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
        "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
        "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
        "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
        "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
        "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
        "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
        "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
        "\\xfe\\xff'")
    testrepr = repr(u''.join(map(unichr, xrange(256))))
    self.assertEqual(testrepr, latin1repr)

    # Test repr works on wide unicode escapes without overflow.
    # Both sides are the same expression: the point is that repr()
    # completes, not what it returns.
    wide = u"\U00010000" * 39 + u"\uffff" * 4096
    self.assertEqual(repr(wide), repr(wide))
98
Walter Dörwald28256f22003-01-19 16:59:20 +000099
def test_count(self):
    # Run the shared count() checks first.
    string_tests.CommonTest.test_count(self)
    # check mixed argument types
    # Each row: (expected count, receiver, arguments passed to count()).
    mixed_cases = [
        (3, 'aaa', (u'a',)),
        (0, 'aaa', (u'b',)),
        (3, u'aaa', ('a',)),
        (0, u'aaa', ('b',)),
        (1, u'aaa', ('a', -1)),
        (3, u'aaa', ('a', -10)),
        (2, u'aaa', ('a', 0, -1)),
        (0, u'aaa', ('a', 0, -10)),
    ]
    for expected, receiver, args in mixed_cases:
        self.checkequalnofix(expected, receiver, 'count', *args)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000112
def test_find(self):
    # Each row: (expected index, arguments passed to find()).
    haystack = u'abcdefghiabc'
    cases = [
        (0, (u'abc',)),
        (9, (u'abc', 1)),
        (-1, (u'def', 4)),
    ]
    for expected, args in cases:
        self.checkequalnofix(expected, haystack, 'find', *args)

    # A missing or non-string argument must raise TypeError.
    self.assertRaises(TypeError, u'hello'.find)
    self.assertRaises(TypeError, u'hello'.find, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000120
def test_rfind(self):
    # Run the shared rfind() checks first.
    string_tests.CommonTest.test_rfind(self)
    # check mixed argument types
    mixed_cases = [
        (9, 'abcdefghiabc', u'abc'),
        (12, 'abcdefghiabc', u''),
        (12, u'abcdefghiabc', ''),
    ]
    for expected, receiver, needle in mixed_cases:
        self.checkequalnofix(expected, receiver, 'rfind', needle)
Guido van Rossum8b264542000-12-19 02:22:31 +0000127
def test_index(self):
    # Run the shared index() checks first.
    string_tests.CommonTest.test_index(self)
    # check mixed argument types
    # t1 is the receiver type and t2 the argument type; both pairings of
    # str and unicode are exercised.
    for (t1, t2) in ((str, unicode), (unicode, str)):
        haystack = t1('abcdefghiabc')
        self.checkequalnofix(0, haystack, 'index', t2(''))
        self.checkequalnofix(3, haystack, 'index', t2('def'))
        self.checkequalnofix(0, haystack, 'index', t2('abc'))
        self.checkequalnofix(9, haystack, 'index', t2('abc'), 1)

        # Each row: (receiver text, arguments passed to index()).
        failing = [
            ('abcdefghiabc', ('hib',)),
            ('abcdefghiab', ('abc', 1)),
            ('abcdefghi', ('ghi', 8)),
            ('abcdefghi', ('ghi', -1)),
        ]
        for text, args in failing:
            self.assertRaises(ValueError, t1(text).index,
                              t2(args[0]), *args[1:])
Guido van Rossuma831cac2000-03-10 23:23:21 +0000140
def test_rindex(self):
    # Run the shared rindex() checks first.
    string_tests.CommonTest.test_rindex(self)
    # check mixed argument types
    # t1 is the receiver type and t2 the argument type; both pairings of
    # str and unicode are exercised.
    for (t1, t2) in ((str, unicode), (unicode, str)):
        haystack = t1('abcdefghiabc')
        self.checkequalnofix(12, haystack, 'rindex', t2(''))
        self.checkequalnofix(3, haystack, 'rindex', t2('def'))
        self.checkequalnofix(9, haystack, 'rindex', t2('abc'))
        self.checkequalnofix(0, haystack, 'rindex', t2('abc'), 0, -1)

        # Each row: (receiver text, arguments passed to rindex()).
        failing = [
            ('abcdefghiabc', ('hib',)),
            ('defghiabc', ('def', 1)),
            ('defghiabc', ('abc', 0, -1)),
            ('abcdefghi', ('ghi', 0, 8)),
            ('abcdefghi', ('ghi', 0, -1)),
        ]
        for text, args in failing:
            self.assertRaises(ValueError, t1(text).rindex,
                              t2(args[0]), *args[1:])
Guido van Rossuma831cac2000-03-10 23:23:21 +0000155
def test_translate(self):
    # Translation tables map an ordinal to None, to another ordinal, or
    # to a unicode string.
    a, b, c = ord('a'), ord('b'), ord('c')
    cases = [
        (u'bbbc', {a: None}),
        (u'iiic', {a: None, b: ord('i')}),
        (u'iiix', {a: None, b: ord('i'), c: u'x'}),
        (u'<i><i><i>c', {a: None, b: u'<i>'}),
        (u'c', {a: None, b: u''}),
    ]
    for expected, table in cases:
        self.checkequalnofix(expected, u'abababc', 'translate', table)
    self.checkequalnofix(u'xyyx', u'xzx', 'translate', {ord('z'): u'yy'})

    # A missing table, or a table whose value is a byte string, must
    # raise TypeError.
    self.assertRaises(TypeError, u'hello'.translate)
    self.assertRaises(TypeError, u'abababc'.translate, {a: ''})
Guido van Rossuma831cac2000-03-10 23:23:21 +0000166
def test_split(self):
    # Run the shared split() checks first.
    string_tests.CommonTest.test_split(self)

    # Mixed arguments
    parts = [u'a', u'b', u'c', u'd']
    self.checkequalnofix(parts, u'a//b//c//d', 'split', '//')
    self.checkequalnofix(parts, 'a//b//c//d', 'split', u'//')
    self.checkequalnofix([u'endcase ', u''], u'endcase test', 'split', 'test')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000174
def test_join(self):
    # Run the shared join() checks first.
    string_tests.MixinStrUnicodeUserStringTest.test_join(self)

    # mixed arguments
    # Each row: (expected result, separator, sequence being joined).
    mixed_cases = [
        (u'a b c d', u' ', ['a', 'b', u'c', u'd']),
        (u'abcd', u'', (u'a', u'b', u'c', u'd')),
        (u'w x y z', u' ', string_tests.Sequence('wxyz')),
        (u'a b c d', ' ', [u'a', u'b', u'c', u'd']),
        (u'a b c d', ' ', ['a', 'b', u'c', u'd']),
        (u'abcd', '', (u'a', u'b', u'c', u'd')),
        (u'w x y z', ' ', string_tests.Sequence(u'wxyz')),
    ]
    for expected, separator, items in mixed_cases:
        self.checkequalnofix(expected, separator, 'join', items)
Marc-André Lemburge5034372000-08-08 08:04:29 +0000186
def test_strip(self):
    # Run the shared strip() checks first.
    string_tests.CommonTest.test_strip(self)
    # Stripping a unicode string with the byte string "\xff" as the
    # character set must raise UnicodeError.
    self.assertRaises(UnicodeError, u"hello".strip, "\xff")
Guido van Rossuma831cac2000-03-10 23:23:21 +0000190
def test_replace(self):
    # Run the shared replace() checks first.
    string_tests.CommonTest.test_replace(self)

    # method call forwarded from str implementation because of unicode argument
    self.checkequalnofix(u'one@two!three!', 'one!two!three!',
                         'replace', u'!', u'@', 1)
    self.assertRaises(TypeError, 'replace'.replace, u"r", 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000197
def test_comparison(self):
    # Comparisons:
    self.assertEqual(u'abc', 'abc')
    self.assertEqual('abc', u'abc')
    self.assertEqual(u'abc', u'abc')
    for longer, shorter in ((u'abcd', 'abc'),
                            ('abcd', u'abc'),
                            (u'abcd', u'abc')):
        self.assertTrue(longer > shorter)
    for shorter, longer in ((u'abc', 'abcd'),
                            ('abc', u'abcd'),
                            (u'abc', u'abcd')):
        self.assertTrue(shorter < longer)

    # NOTE(review): indentation was lost in the input this was restored
    # from; everything below is taken to belong to the disabled branch.
    if 0:
        # Move these tests to a Unicode collation module test...
        # Testing UTF-16 code point order comparisons...

        # No surrogates, no fixup required.
        self.assertTrue(u'\u0061' < u'\u20ac')
        # Non surrogate below surrogate value, no fixup required
        self.assertTrue(u'\u0061' < u'\ud800\udc02')

        # Non surrogate above surrogate value, fixup required
        def test_lecmp(s, s2):
            self.assertTrue(s < s2)

        surrogate_pairs = (
            u'\ud800\udc01', u'\ud900\udc01', u'\uda00\udc01', u'\udb00\udc01',
            u'\ud800\udd01', u'\ud900\udd01', u'\uda00\udd01', u'\udb00\udd01',
            u'\ud800\ude01', u'\ud900\ude01', u'\uda00\ude01', u'\udb00\ude01',
            u'\ud800\udfff', u'\ud900\udfff', u'\uda00\udfff', u'\udb00\udfff',
        )

        def test_fixup(s):
            for s2 in surrogate_pairs:
                test_lecmp(s, s2)

        test_fixup(u'\ue000')
        test_fixup(u'\uff61')

        # Surrogates on both sides, no fixup required
        self.assertTrue(u'\ud800\udc02' < u'\ud84d\udc56')
262
def test_islower(self):
    # Run the shared islower() checks, then one check for U+1FFC.
    string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
    self.checkequalnofix(False, u'\u1FFc', 'islower')
Walter Dörwald28256f22003-01-19 16:59:20 +0000266
def test_isupper(self):
    # Run the shared isupper() checks first.
    string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
    # The U+1FFC check is skipped when sys.platform starts with 'java'.
    if not sys.platform.startswith('java'):
        self.checkequalnofix(False, u'\u1FFc', 'isupper')
Walter Dörwald28256f22003-01-19 16:59:20 +0000271
def test_istitle(self):
    # Run the shared istitle() checks.  The sibling is*() tests each call
    # the parent test of the same name; this one used to call test_title,
    # which left the shared istitle() checks unexercised here.
    string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
    self.checkequalnofix(True, u'\u1FFc', 'istitle')
    self.checkequalnofix(True, u'Greek \u1FFcitlecases ...', 'istitle')
Walter Dörwald28256f22003-01-19 16:59:20 +0000276
def test_isspace(self):
    # Run the shared isspace() checks, then three non-ASCII code points.
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
    for expected, char in ((True, u'\u2000'),
                           (True, u'\u200a'),
                           (False, u'\u2014')):
        self.checkequalnofix(expected, char, 'isspace')
Walter Dörwald28256f22003-01-19 16:59:20 +0000282
def test_isalpha(self):
    # Run the shared isalpha() checks, then one check for U+1FFC.
    string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
    self.checkequalnofix(True, u'\u1FFc', 'isalpha')
Walter Dörwald28256f22003-01-19 16:59:20 +0000286
def test_isdecimal(self):
    # Each row: (expected result, receiver).
    cases = [
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (False, u'\u2460'),  # CIRCLED DIGIT ONE
        (False, u'\xbc'),    # VULGAR FRACTION ONE QUARTER
        (True, u'\u0660'),   # ARABIC-INDIC DIGIT ZERO
        (True, u'0123456789'),
        (False, u'0123456789a'),
    ]
    for expected, text in cases:
        self.checkequalnofix(expected, text, 'isdecimal')

    # isdecimal() takes no arguments.
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)
Walter Dörwald28256f22003-01-19 16:59:20 +0000298
def test_isdigit(self):
    # Run the shared isdigit() checks, then three non-ASCII code points.
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
    for expected, char in ((True, u'\u2460'),
                           (False, u'\xbc'),
                           (True, u'\u0660')):
        self.checkequalnofix(expected, char, 'isdigit')
Walter Dörwald28256f22003-01-19 16:59:20 +0000304
def test_isnumeric(self):
    # Each row: (expected result, receiver).
    cases = [
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (True, u'\u2460'),
        (True, u'\xbc'),
        (True, u'\u0660'),
        (True, u'0123456789'),
        (False, u'0123456789a'),
    ]
    for expected, text in cases:
        self.checkequalnofix(expected, text, 'isnumeric')

    # isnumeric() takes no arguments.
    self.assertRaises(TypeError, u"abc".isnumeric, 42)
316
def test_contains(self):
    # Testing Unicode contains method
    # Each row: (needle, container, expected result of "needle in container").
    explicit_cases = [
        ('a', u'abdb', True),
        ('a', u'bdab', True),
        ('a', u'bdaba', True),
        ('a', u'bdba', True),
        (u'a', u'bdba', True),
        (u'a', u'bdb', False),
        (u'a', 'bdb', False),
        (u'a', 'bdba', True),
        (u'a', ('a', 1, None), True),
        (u'a', (1, None, 'a'), True),
        (u'a', (1, None, u'a'), True),
        ('a', ('a', 1, None), True),
        ('a', (1, None, 'a'), True),
        ('a', (1, None, u'a'), True),
        ('a', ('x', 1, u'y'), False),
        ('a', ('x', 1, None), False),
        (u'abcd', u'abcxxxx', False),
        (u'ab', u'abcd', True),
        ('ab', u'abc', True),
        (u'ab', 'abc', True),
        (u'ab', (1, None, u'ab'), True),
        (u'', u'abc', True),
        ('', u'abc', True),
    ]
    for needle, container, expected in explicit_cases:
        self.assertEqual(needle in container, expected)

    # If the following fails either
    # the contains operator does not propagate UnicodeErrors or
    # someone has changed the default encoding
    self.assertRaises(UnicodeError, 'g\xe2teau'.__contains__, u'\xe2')

    # Each ASCII-only row is tried as unicode-in-str, str-in-unicode and
    # unicode-in-unicode.
    ascii_cases = [
        ('', '', True),
        ('', 'abc', True),
        ('\0', 'abc', False),
        ('\0', '\0abc', True),
        ('\0', 'abc\0', True),
        ('a', '\0abc', True),
        ('asdf', 'asdf', True),
        ('asdf', 'asd', False),
        ('asdf', '', False),
    ]
    type_pairs = ((unicode, str), (str, unicode), (unicode, unicode))
    for needle, haystack, expected in ascii_cases:
        for needle_type, haystack_type in type_pairs:
            self.assertEqual(
                needle_type(needle) in haystack_type(haystack), expected)

    # __contains__() requires exactly one argument.
    self.assertRaises(TypeError, u"abc".__contains__)
378
def test_formatting(self):
    # Run the shared %-formatting checks first.
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
    # %5.2f pads to a width of five characters, so every value shorter
    # than that is preceded by an extra space in the expected text.
    template = u"%s, %s, %i, %f, %5.2f"
    self.assertEqual(template % (u"abc", "abc", 1, 2, 3),
                     u'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual(template % (u"abc", "abc", 1, -2, 3),
                     u'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual(template % (u"abc", "abc", -1, -2, 3.5),
                     u'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual(template % (u"abc", "abc", -1, -2, 3.57),
                     u'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual(template % (u"abc", "abc", -1, -2, 1003.57),
                     u'abc, abc, -1, -2.000000, 1003.57')
    if not sys.platform.startswith('java'):
        self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
    self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
    self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')

    self.assertEqual(u'%c' % 0x1234, u'\u1234')
    self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))

    # ASCII range: %c accepts both the byte string and its ordinal.
    for num in range(0x00, 0x80):
        char = chr(num)
        self.assertEqual(u"%c" % char, char)
        self.assertEqual(u"%c" % num, char)
    # Issue 7649
    for num in range(0x80, 0x100):
        uchar = unichr(num)
        self.assertEqual(uchar, u"%c" % num)    # works only with ints
        self.assertEqual(uchar, u"%c" % uchar)  # and unicode chars
        # the implicit decoding should fail for non-ascii chars
        self.assertRaises(UnicodeDecodeError, u"%c".__mod__, chr(num))
        self.assertRaises(UnicodeDecodeError, u"%s".__mod__, chr(num))

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % u"abc", u'...abc...')
    # Field width 5 throughout: the padding spaces are part of the
    # expected value (right-aligned, or left-aligned for a negative width).
    self.assertEqual('%*s' % (5,u'abc',), u'  abc')
    self.assertEqual('%*s' % (-5,u'abc',), u'abc  ')
    self.assertEqual('%*.*s' % (5,2,u'abc',), u'   ab')
    self.assertEqual('%*.*s' % (5,3,u'abc',), u'  abc')
    self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103   abc')
    self.assertEqual('%c' % u'a', u'a')

    # An object whose __str__ returns unicode, formatted with a byte-string
    # template, must produce that unicode value.
    class Wrapper:
        def __str__(self):
            return u'\u1234'
    self.assertEqual('%s' % Wrapper(), u'\u1234')
Tim Peters4511a712006-05-03 04:46:14 +0000430
@test_support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    # should not format with a comma, but always with C locale
    formatted = u'%.1f' % 1.0
    self.assertEqual(u'1.0', formatted)
Georg Brandlda6b1072006-01-20 17:48:54 +0000435
Walter Dörwald28256f22003-01-19 16:59:20 +0000436 def test_constructor(self):
437 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
438
439 self.assertEqual(
440 unicode(u'unicode remains unicode'),
441 u'unicode remains unicode'
442 )
443
444 class UnicodeSubclass(unicode):
Marc-André Lemburg79f57832002-12-29 19:44:06 +0000445 pass
Guido van Rossuma831cac2000-03-10 23:23:21 +0000446
Walter Dörwald28256f22003-01-19 16:59:20 +0000447 self.assertEqual(
448 unicode(UnicodeSubclass('unicode subclass becomes unicode')),
449 u'unicode subclass becomes unicode'
450 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000451
Walter Dörwald28256f22003-01-19 16:59:20 +0000452 self.assertEqual(
453 unicode('strings are converted to unicode'),
454 u'strings are converted to unicode'
455 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000456
Walter Dörwald28256f22003-01-19 16:59:20 +0000457 class UnicodeCompat:
458 def __init__(self, x):
459 self.x = x
460 def __unicode__(self):
461 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000462
Walter Dörwald28256f22003-01-19 16:59:20 +0000463 self.assertEqual(
464 unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
465 u'__unicode__ compatible objects are recognized')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000466
Walter Dörwald28256f22003-01-19 16:59:20 +0000467 class StringCompat:
468 def __init__(self, x):
469 self.x = x
470 def __str__(self):
471 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000472
Walter Dörwald28256f22003-01-19 16:59:20 +0000473 self.assertEqual(
474 unicode(StringCompat('__str__ compatible objects are recognized')),
475 u'__str__ compatible objects are recognized'
476 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000477
Walter Dörwald28256f22003-01-19 16:59:20 +0000478 # unicode(obj) is compatible to str():
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000479
Walter Dörwald28256f22003-01-19 16:59:20 +0000480 o = StringCompat('unicode(obj) is compatible to str()')
481 self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
482 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000483
Marc-André Lemburgd25c6502004-07-23 16:13:25 +0000484 # %-formatting and .__unicode__()
485 self.assertEqual(u'%s' %
486 UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"),
487 u"u'%s' % obj uses obj.__unicode__()")
488 self.assertEqual(u'%s' %
489 UnicodeCompat(u"u'%s' % obj falls back to obj.__str__()"),
490 u"u'%s' % obj falls back to obj.__str__()")
491
Walter Dörwald28256f22003-01-19 16:59:20 +0000492 for obj in (123, 123.45, 123L):
493 self.assertEqual(unicode(obj), unicode(str(obj)))
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000494
Walter Dörwald28256f22003-01-19 16:59:20 +0000495 # unicode(obj, encoding, error) tests (this maps to
496 # PyUnicode_FromEncodedObject() at C level)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000497
Walter Dörwald28256f22003-01-19 16:59:20 +0000498 if not sys.platform.startswith('java'):
499 self.assertRaises(
500 TypeError,
501 unicode,
502 u'decoding unicode is not supported',
503 'utf-8',
504 'strict'
505 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000506
Walter Dörwald28256f22003-01-19 16:59:20 +0000507 self.assertEqual(
508 unicode('strings are decoded to unicode', 'utf-8', 'strict'),
509 u'strings are decoded to unicode'
510 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000511
Walter Dörwald28256f22003-01-19 16:59:20 +0000512 if not sys.platform.startswith('java'):
513 self.assertEqual(
514 unicode(
515 buffer('character buffers are decoded to unicode'),
516 'utf-8',
517 'strict'
518 ),
519 u'character buffers are decoded to unicode'
520 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000521
Walter Dörwald28256f22003-01-19 16:59:20 +0000522 self.assertRaises(TypeError, unicode, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000523
Walter Dörwald28256f22003-01-19 16:59:20 +0000524 def test_codecs_utf7(self):
525 utfTests = [
526 (u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
527 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
528 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
529 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
530 (u'+', '+-'),
531 (u'+-', '+--'),
532 (u'+?', '+-?'),
533 (u'\?', '+AFw?'),
534 (u'+?', '+-?'),
535 (ur'\\?', '+AFwAXA?'),
536 (ur'\\\?', '+AFwAXABc?'),
537 (ur'++--', '+-+---')
538 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000539
Walter Dörwald28256f22003-01-19 16:59:20 +0000540 for (x, y) in utfTests:
541 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000542
Walter Dörwald28256f22003-01-19 16:59:20 +0000543 # surrogates not supported
544 self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000545
Walter Dörwald28256f22003-01-19 16:59:20 +0000546 self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000547
Antoine Pitrou4982d5d2008-07-25 17:45:59 +0000548 # Issue #2242: crash on some Windows/MSVC versions
549 self.assertRaises(UnicodeDecodeError, '+\xc1'.decode, 'utf-7')
550
def test_codecs_utf8(self):
    # Each row: (unicode text, expected UTF-8 bytes).
    encode_cases = [
        (u'', ''),
        (u'\u20ac', '\xe2\x82\xac'),
        (u'\ud800\udc02', '\xf0\x90\x80\x82'),
        (u'\ud84d\udc56', '\xf0\xa3\x91\x96'),
        (u'\ud800', '\xed\xa0\x80'),
        (u'\udc00', '\xed\xb0\x80'),
        (u'\ud800\udc02'*1000, '\xf0\x90\x80\x82'*1000),
    ]
    for text, encoded in encode_cases:
        self.assertEqual(text.encode('utf-8'), encoded)

    # A longer sample mixing 3-byte sequences with plain ASCII.
    long_text = (
        u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
        u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
        u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
        u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
        u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
        u' Nunstuck git und'
    )
    long_encoded = (
        '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
        '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
        '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
        '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
        '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
        '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
        '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
        '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
        '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
        '\xe3\x80\x8cWenn ist das Nunstuck git und'
    )
    self.assertEqual(long_text.encode('utf-8'), long_encoded)

    # UTF-8 specific decoding tests
    decode_cases = [
        ('\xf0\xa3\x91\x96', u'\U00023456'),
        ('\xf0\x90\x80\x82', u'\U00010002'),
        ('\xe2\x82\xac', u'\u20ac'),
    ]
    for encoded, text in decode_cases:
        self.assertEqual(unicode(encoded, 'utf-8'), text)

    # Other possible utf-8 test cases:
    # * strict decoding testing for all of the
    #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000589
def test_utf8_decode_valid_sequences(self):
    # Each row: (UTF-8 bytes, expected unicode text), covering the first
    # and last values of each sequence length.
    sequences = [
        # single byte
        ('\x00', u'\x00'), ('a', u'a'), ('\x7f', u'\x7f'),
        # 2 bytes
        ('\xc2\x80', u'\x80'), ('\xdf\xbf', u'\u07ff'),
        # 3 bytes
        ('\xe0\xa0\x80', u'\u0800'), ('\xed\x9f\xbf', u'\ud7ff'),
        ('\xee\x80\x80', u'\uE000'), ('\xef\xbf\xbf', u'\uffff'),
        # 4 bytes
        ('\xF0\x90\x80\x80', u'\U00010000'),
        ('\xf4\x8f\xbf\xbf', u'\U0010FFFF')
    ]
    for seq, res in sequences:
        self.assertEqual(seq.decode('utf-8'), res)

    # Round-trip every code point.  The upper bound is sys.maxunicode + 1
    # so that the highest code point is included, and xrange() avoids
    # building a list with one entry per code point.
    for codepoint in xrange(sys.maxunicode + 1):
        ch = unichr(codepoint)
        self.assertEqual(ch, ch.encode('utf-8').decode('utf-8'))
608
def test_utf8_decode_invalid_sequences(self):
    # continuation bytes in a sequence of 2, 3, or 4 bytes
    continuation_bytes = [chr(b) for b in range(0x80, 0xC0)]
    # start bytes of a 2-byte sequence equivalent to codepoints <= 0x7F
    invalid_2B_seq_start_bytes = [chr(b) for b in range(0xC0, 0xC2)]
    # start bytes of a 4-byte sequence equivalent to codepoints > 0x10FFFF
    invalid_4B_seq_start_bytes = [chr(b) for b in range(0xF5, 0xF8)]
    # 0xF7 is already in the previous list, so the remaining bytes
    # start at 0xF8.
    invalid_start_bytes = (
        continuation_bytes + invalid_2B_seq_start_bytes +
        invalid_4B_seq_start_bytes + [chr(b) for b in range(0xF8, 0x100)]
    )

    def check_rejected(data):
        # Strict UTF-8 decoding of data must raise UnicodeDecodeError.
        self.assertRaises(UnicodeDecodeError, data.decode, 'utf-8')

    for byte in invalid_start_bytes:
        check_rejected(byte)

    for sb in invalid_2B_seq_start_bytes:
        for cb in continuation_bytes:
            check_rejected(sb+cb)

    for sb in invalid_4B_seq_start_bytes:
        for cb1 in continuation_bytes[:3]:
            for cb3 in continuation_bytes[:3]:
                check_rejected(sb+cb1+'\x80'+cb3)

    for cb in [chr(b) for b in range(0x80, 0xA0)]:
        check_rejected('\xE0'+cb+'\x80')
        check_rejected('\xE0'+cb+'\xBF')
    # XXX: surrogates shouldn't be valid UTF-8!
    # see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
    # (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
    #for cb in map(chr, range(0xA0, 0xC0)):
        #self.assertRaises(UnicodeDecodeError,
                          #('\xED'+cb+'\x80').decode, 'utf-8')
        #self.assertRaises(UnicodeDecodeError,
                          #('\xED'+cb+'\xBF').decode, 'utf-8')
    for cb in [chr(b) for b in range(0x80, 0x90)]:
        check_rejected('\xF0'+cb+'\x80\x80')
        check_rejected('\xF0'+cb+'\xBF\xBF')
    for cb in [chr(b) for b in range(0x90, 0xC0)]:
        check_rejected('\xF4'+cb+'\x80\x80')
        check_rejected('\xF4'+cb+'\xBF\xBF')
657 ('\xF4'+cb+'\xBF\xBF').decode, 'utf-8')
658
def test_issue8271(self):
    # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
    # only the start byte and the continuation byte(s) are now considered
    # invalid, instead of the number of bytes specified by the start byte.
    # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
    # table 3-8, Row 2) for more information about the algorithm used.
    #
    # Each entry pairs an invalid byte string with the text the 'replace'
    # error handler must produce for it ("cb" = continuation byte).
    FFFD = u'\ufffd'
    sequences = [
        # invalid start bytes
        ('\x80', FFFD), # continuation byte
        ('\x80\x80', FFFD*2), # 2 continuation bytes
        ('\xc0', FFFD),
        ('\xc0\xc0', FFFD*2),
        ('\xc1', FFFD),
        ('\xc1\xc0', FFFD*2),
        ('\xc0\xc1', FFFD*2),
        # with start byte of a 2-byte sequence
        ('\xc2', FFFD), # only the start byte
        ('\xc2\xc2', FFFD*2), # 2 start bytes
        ('\xc2\xc2\xc2', FFFD*3), # 3 start bytes
        ('\xc2\x41', FFFD+'A'), # invalid continuation byte
        # with start byte of a 3-byte sequence
        ('\xe1', FFFD), # only the start byte
        ('\xe1\xe1', FFFD*2), # 2 start bytes
        ('\xe1\xe1\xe1', FFFD*3), # 3 start bytes
        ('\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
        ('\xe1\x80', FFFD), # only 1 continuation byte
        ('\xe1\x41', FFFD+'A'), # invalid continuation byte
        ('\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
        ('\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
        ('\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
        ('\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
        ('\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
        # with start byte of a 4-byte sequence
        ('\xf1', FFFD), # only the start byte
        ('\xf1\xf1', FFFD*2), # 2 start bytes
        ('\xf1\xf1\xf1', FFFD*3), # 3 start bytes
        ('\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
        ('\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
        ('\xf1\x80', FFFD), # only 1 continuation byte
        ('\xf1\x80\x80', FFFD), # only 2 continuation bytes
        ('\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
        ('\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 2 invalid
        ('\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
        ('\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cb and 1 valid
        ('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 valid
        ('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 valid
        ('\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 2 invalid cb and 1 valid
        ('\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
        ('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
        ('\xf1\xf1\x80\x41', FFFD*2+'A'),
        ('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
        # with invalid start byte of a 4-byte sequence (rfc2279)
        ('\xf5', FFFD), # only the start byte
        ('\xf5\xf5', FFFD*2), # 2 start bytes
        ('\xf5\x80', FFFD*2), # only 1 continuation byte
        ('\xf5\x80\x80', FFFD*3), # only 2 continuation bytes
        ('\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
        ('\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
        ('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
        ('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
        # with invalid start byte of a 5-byte sequence (rfc2279)
        ('\xf8', FFFD), # only the start byte
        ('\xf8\xf8', FFFD*2), # 2 start bytes
        ('\xf8\x80', FFFD*2), # only one continuation byte
        ('\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
        ('\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
        # with invalid start byte of a 6-byte sequence (rfc2279)
        ('\xfc', FFFD), # only the start byte
        ('\xfc\xfc', FFFD*2), # 2 start bytes
        ('\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
        ('\xfc\x80\x80\x80\x80\x80', FFFD*6), # start byte + 5 continuation bytes
        # invalid start byte
        ('\xfe', FFFD),
        ('\xfe\x80\x80', FFFD*3),
        # other sequences
        ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
        ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
        ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
        ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
         u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
    ]
    for seq, res in sequences:
        # strict mode rejects the sequence outright
        self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
        self.assertEqual(seq.decode('utf-8', 'replace'), res)
        # a trailing valid byte must not change the replacement count
        self.assertEqual((seq+'b').decode('utf-8', 'replace'), res+'b')
        # 'ignore' yields the same text minus the replacement characters
        self.assertEqual(seq.decode('utf-8', 'ignore'),
                         res.replace(u'\uFFFD', ''))
747
def test_codecs_idna(self):
    # The idna codec must preserve the trailing dot of a host name.
    hostname = u"www.python.org."
    encoded = hostname.encode("idna")
    self.assertEqual(encoded, "www.python.org.")
751
def test_codecs_errors(self):
    # Check the error handlers of the codec machinery, plus the errors
    # raised for broken codecs and wrong arguments.
    non_ascii_text = u'Andr\202 x'
    non_ascii_bytes = 'Andr\202 x'

    # Error handling (encoding): default and explicit 'strict' both raise
    for handler_args in ((), ('strict',)):
        self.assertRaises(UnicodeError, non_ascii_text.encode,
                          'ascii', *handler_args)
    self.assertEqual(non_ascii_text.encode('ascii', 'ignore'), "Andr x")
    self.assertEqual(non_ascii_text.encode('ascii', 'replace'), "Andr? x")

    # Error handling (decoding): default and explicit 'strict' both raise
    for handler_args in ((), ('strict',)):
        self.assertRaises(UnicodeError, unicode, non_ascii_bytes,
                          'ascii', *handler_args)
    self.assertEqual(unicode(non_ascii_bytes, 'ascii', 'ignore'), u"Andr x")
    self.assertEqual(unicode(non_ascii_bytes, 'ascii', 'replace'),
                     u'Andr\uFFFD x')

    # Error handling (unknown character names)
    decoded = "\\N{foo}xx".decode("unicode-escape", "ignore")
    self.assertEqual(decoded, u"xx")

    # Error handling (truncated escape sequence)
    self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")

    # The test.unicode1/test.unicode2 codecs are registered by the
    # module-level search_function and return malformed results.
    broken_codec_calls = [
        ("hello".decode, ("test.unicode1",)),
        (unicode, ("hello", "test.unicode2")),
        (u"hello".encode, ("test.unicode1",)),
        (u"hello".encode, ("test.unicode2",)),
    ]
    for func, call_args in broken_codec_calls:
        self.assertRaises(TypeError, func, *call_args)

    # executes PyUnicode_Encode()
    import imp
    missing_module = "non-existing module"
    search_path = [u"non-existing dir"]
    self.assertRaises(ImportError, imp.find_module,
                      missing_module, search_path)

    # Error handling (wrong arguments)
    self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)

    # Error handling (PyUnicode_EncodeDecimal())
    self.assertRaises(UnicodeError, int, u"\u0200")
Guido van Rossum97064862000-04-10 13:52:48 +0000789
def test_codecs(self):
    # Check encoded output for a plain ASCII string and round-trip
    # safety of the built-in codecs over several code point ranges.

    def check_roundtrip(text, encodings):
        # encode then decode with each codec; the text must come back
        for encoding in encodings:
            self.assertEqual(unicode(text.encode(encoding), encoding), text)

    # Encoding
    expected_bytes = [
        ('ascii', 'hello'),
        ('utf-7', 'hello'),
        ('utf-8', 'hello'),
        ('utf8', 'hello'),
        ('utf-16-le', 'h\000e\000l\000l\000o\000'),
        ('utf-16-be', '\000h\000e\000l\000l\000o'),
        ('latin-1', 'hello'),
    ]
    for encoding, encoded in expected_bytes:
        self.assertEqual(u'hello'.encode(encoding), encoded)

    # Roundtrip safety for BMP (just the first 1024 chars)
    unicode_codecs = ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
                      'utf-16-be', 'raw_unicode_escape',
                      'unicode_escape', 'unicode_internal')
    for code in xrange(1024):
        check_roundtrip(unichr(code), unicode_codecs)

    # Roundtrip safety for BMP (just the first 256 chars)
    for code in xrange(256):
        check_roundtrip(unichr(code), ('latin-1',))

    # Roundtrip safety for BMP (just the first 128 chars)
    for code in xrange(128):
        check_roundtrip(unichr(code), ('ascii',))

    # Roundtrip safety for non-BMP (just a few chars)
    non_bmp = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
    check_roundtrip(non_bmp,
                    ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                     #'raw_unicode_escape',
                     'unicode_escape', 'unicode_internal'))

    # UTF-8 must be roundtrip safe for all UCS-2 code points
    # This excludes surrogates: in the full range, there would be
    # a surrogate pair (\udbff\udc00), which gets converted back
    # to a non-BMP character (\U0010fc00)
    ucs2_codes = range(0, 0xd800) + range(0xe000, 0x10000)
    ucs2_text = u''.join(map(unichr, ucs2_codes))
    check_roundtrip(ucs2_text, ('utf-8',))
Guido van Rossum9e896b32000-04-05 20:11:21 +0000834
def test_codecs_charmap(self):
    # Each listed charmap codec must decode and re-encode the given half
    # of the 8-bit range without changing it.

    # 0-127
    low_half = ''.join(chr(code) for code in xrange(128))
    low_half_codecs = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
        'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        'cp1256', 'cp1257', 'cp1258',
        'cp856', 'cp857', 'cp864', 'cp869', 'cp874',

        'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
        'cp1006', 'iso8859_8',

        ### These have undefined mappings:
        #'cp424',

        ### These fail the round-trip:
        #'cp875'
    )
    for codec in low_half_codecs:
        self.assertEqual(unicode(low_half, codec).encode(codec), low_half)

    # 128-255
    high_half = ''.join(chr(code) for code in xrange(128, 256))
    high_half_codecs = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_4', 'iso8859_5',
        'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        ### These have undefined mappings:
        #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        #'cp1256', 'cp1257', 'cp1258',
        #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
        #'iso8859_3', 'iso8859_6', 'iso8859_7',
        #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',

        ### These fail the round-trip:
        #'cp1006', 'cp875', 'iso8859_8',
    )
    for codec in high_half_codecs:
        self.assertEqual(unicode(high_half, codec).encode(codec), high_half)
Guido van Rossum9e896b32000-04-05 20:11:21 +0000888
Walter Dörwald28256f22003-01-19 16:59:20 +0000889 def test_concatenation(self):
890 self.assertEqual((u"abc" u"def"), u"abcdef")
891 self.assertEqual(("abc" u"def"), u"abcdef")
892 self.assertEqual((u"abc" "def"), u"abcdef")
893 self.assertEqual((u"abc" u"def" "ghi"), u"abcdefghi")
894 self.assertEqual(("abc" "def" u"ghi"), u"abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +0000895
def test_printing(self):
    # Smoke test: the print statement must accept unicode objects, alone
    # or mixed with str, with and without a trailing comma. The output
    # is discarded; the test only checks that nothing raises.
    class NullWriter:
        def write(self, data):
            pass

    sink = NullWriter()
    # The statement order is kept exactly as is, including the repeated
    # lines.
    print >>sink, u'abc'
    print >>sink, u'abc', u'def'
    print >>sink, u'abc', 'def'
    print >>sink, 'abc', u'def'
    print >>sink, u'abc\n'
    print >>sink, u'abc\n',
    print >>sink, u'abc\n',
    print >>sink, u'def\n'
    print >>sink, u'def\n'
Fred Drake004d5e62000-10-23 17:22:08 +0000911
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +0000912 def test_ucs4(self):
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +0000913 x = u'\U00100000'
914 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
915 self.assertEqual(x, y)
916
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +0000917 y = r'\U00100000'
918 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
919 self.assertEqual(x, y)
920 y = r'\U00010000'
921 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
922 self.assertEqual(x, y)
923
924 try:
925 '\U11111111'.decode("raw-unicode-escape")
926 except UnicodeDecodeError as e:
927 self.assertEqual(e.start, 0)
928 self.assertEqual(e.end, 10)
929 else:
930 self.fail("Should have raised UnicodeDecodeError")
931
def test_conversion(self):
    # Make sure __unicode__() works properly

    # old-style class with only __str__
    class Foo0:
        def __str__(self):
            return "foo"

    # old-style class with __unicode__
    class Foo1:
        def __unicode__(self):
            return u"foo"

    # new-style class with __unicode__
    class Foo2(object):
        def __unicode__(self):
            return u"foo"

    # __unicode__ returning a str
    class Foo3(object):
        def __unicode__(self):
            return "foo"

    # str subclass whose __unicode__ returns a str
    class Foo4(str):
        def __unicode__(self):
            return "foo"

    # unicode subclass whose __unicode__ returns a str
    class Foo5(unicode):
        def __unicode__(self):
            return "foo"

    # str subclass defining both __str__ and __unicode__
    class Foo6(str):
        def __str__(self):
            return "foos"

        def __unicode__(self):
            return u"foou"

    # unicode subclass defining both __str__ and __unicode__
    class Foo7(unicode):
        def __str__(self):
            return "foos"
        def __unicode__(self):
            return u"foou"

    # unicode subclass that doubles its content and returns itself
    class Foo8(unicode):
        def __new__(cls, content=""):
            return unicode.__new__(cls, 2*content)
        def __unicode__(self):
            return self

    # unicode subclass whose __unicode__ returns a str
    class Foo9(unicode):
        def __str__(self):
            return "string"
        def __unicode__(self):
            return "not unicode"

    conversions = [
        (Foo0(), u"foo"),
        (Foo1(), u"foo"),
        (Foo2(), u"foo"),
        (Foo3(), u"foo"),
        (Foo4("bar"), u"foo"),
        (Foo5("bar"), u"foo"),
        (Foo6("bar"), u"foou"),
        (Foo7("bar"), u"foou"),
        (Foo8("foo"), u"foofoo"),
    ]
    for obj, expected in conversions:
        self.assertEqual(unicode(obj), expected)

    # str() and unicode() must each pick their own special method
    self.assertEqual(str(Foo9("foo")), "string")
    self.assertEqual(unicode(Foo9("foo")), u"not unicode")
994
Anthony Baxter67b6d512006-03-30 10:54:07 +0000995 def test_unicode_repr(self):
996 class s1:
997 def __repr__(self):
998 return '\\n'
999
1000 class s2:
1001 def __repr__(self):
1002 return u'\\n'
1003
1004 self.assertEqual(repr(s1()), '\\n')
1005 self.assertEqual(repr(s2()), '\\n')
1006
def test_expandtabs_overflows_gracefully(self):
    # This test only affects 32-bit platforms because expandtabs can only take
    # an int as the max value, not a 64-bit C long. If expandtabs is changed
    # to take a 64-bit long, this test should apply to all platforms.
    applies_here = (sys.maxint <= (1 << 32) and
                    struct.calcsize('P') == 4)
    if applies_here:
        self.assertRaises(OverflowError, u't\tt\t'.expandtabs, sys.maxint)
Anthony Baxter67b6d512006-03-30 10:54:07 +00001014
Eric Smitha9f7d622008-02-17 19:46:49 +00001015 def test__format__(self):
1016 def test(value, format, expected):
1017 # test both with and without the trailing 's'
1018 self.assertEqual(value.__format__(format), expected)
1019 self.assertEqual(value.__format__(format + u's'), expected)
1020
1021 test(u'', u'', u'')
1022 test(u'abc', u'', u'abc')
1023 test(u'abc', u'.3', u'abc')
1024 test(u'ab', u'.3', u'ab')
1025 test(u'abcdef', u'.3', u'abc')
1026 test(u'abcdef', u'.0', u'')
1027 test(u'abc', u'3.3', u'abc')
1028 test(u'abc', u'2.3', u'abc')
1029 test(u'abc', u'2.2', u'ab')
1030 test(u'abc', u'3.2', u'ab ')
1031 test(u'result', u'x<0', u'result')
1032 test(u'result', u'x<5', u'result')
1033 test(u'result', u'x<6', u'result')
1034 test(u'result', u'x<7', u'resultx')
1035 test(u'result', u'x<8', u'resultxx')
1036 test(u'result', u' <7', u'result ')
1037 test(u'result', u'<7', u'result ')
1038 test(u'result', u'>7', u' result')
1039 test(u'result', u'>8', u' result')
1040 test(u'result', u'^8', u' result ')
1041 test(u'result', u'^9', u' result ')
1042 test(u'result', u'^10', u' result ')
1043 test(u'a', u'10000', u'a' + u' ' * 9999)
1044 test(u'', u'10000', u' ' * 10000)
1045 test(u'', u'10000000', u' ' * 10000000)
1046
1047 # test mixing unicode and str
1048 self.assertEqual(u'abc'.__format__('s'), u'abc')
1049 self.assertEqual(u'abc'.__format__('->10s'), u'-------abc')
1050
def test_format(self):
    # Exercise unicode.format(): literal text and brace escaping, field
    # lookup, format specs, conversions, __format__ hooks, and the errors
    # raised for malformed templates.
    # Padding wider than one space is written as u' ' * n so that the
    # expected width stays explicit and cannot be lost to whitespace
    # normalization.
    self.assertEqual(u''.format(), u'')
    self.assertEqual(u'a'.format(), u'a')
    self.assertEqual(u'ab'.format(), u'ab')
    self.assertEqual(u'a{{'.format(), u'a{')
    self.assertEqual(u'a}}'.format(), u'a}')
    self.assertEqual(u'{{b'.format(), u'{b')
    self.assertEqual(u'}}b'.format(), u'}b')
    self.assertEqual(u'a{{b'.format(), u'a{b')

    # examples from the PEP:
    import datetime
    self.assertEqual(u"My name is {0}".format(u'Fred'), u"My name is Fred")
    self.assertEqual(u"My name is {0[name]}".format(dict(name=u'Fred')),
                     u"My name is Fred")
    self.assertEqual(u"My name is {0} :-{{}}".format(u'Fred'),
                     u"My name is Fred :-{}")

    # datetime.__format__ doesn't work with unicode
    #d = datetime.date(2007, 8, 18)
    #self.assertEqual("The year is {0.year}".format(d),
    #                 "The year is 2007")

    # classes we'll use for testing
    class C:
        def __init__(self, x=100):
            self._x = x
        def __format__(self, spec):
            return spec

    class D:
        def __init__(self, x):
            self.x = x
        def __format__(self, spec):
            return str(self.x)

    # class with __str__, but no __format__
    class E:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return u'E(' + self.x + u')'

    # class with __repr__, but no __format__ or __str__
    class F:
        def __init__(self, x):
            self.x = x
        def __repr__(self):
            return u'F(' + self.x + u')'

    # class with __format__ that forwards to string, for some format_spec's
    class G:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return u"string is " + self.x
        def __format__(self, format_spec):
            if format_spec == 'd':
                return u'G(' + self.x + u')'
            return object.__format__(self, format_spec)

    # class that returns a bad type from __format__
    class H:
        def __format__(self, format_spec):
            return 1.0

    class I(datetime.date):
        def __format__(self, format_spec):
            return self.strftime(format_spec)

    class J(int):
        def __format__(self, format_spec):
            return int.__format__(self * 2, format_spec)


    self.assertEqual(u''.format(), u'')
    self.assertEqual(u'abc'.format(), u'abc')
    self.assertEqual(u'{0}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:}'.format(u'abc'), u'abc')
    self.assertEqual(u'X{0}'.format(u'abc'), u'Xabc')
    self.assertEqual(u'{0}X'.format(u'abc'), u'abcX')
    self.assertEqual(u'X{0}Y'.format(u'abc'), u'XabcY')
    self.assertEqual(u'{1}'.format(1, u'abc'), u'abc')
    self.assertEqual(u'X{1}'.format(1, u'abc'), u'Xabc')
    self.assertEqual(u'{1}X'.format(1, u'abc'), u'abcX')
    self.assertEqual(u'X{1}Y'.format(1, u'abc'), u'XabcY')
    self.assertEqual(u'{0}'.format(-15), u'-15')
    self.assertEqual(u'{0}{1}'.format(-15, u'abc'), u'-15abc')
    self.assertEqual(u'{0}X{1}'.format(-15, u'abc'), u'-15Xabc')
    self.assertEqual(u'{{'.format(), u'{')
    self.assertEqual(u'}}'.format(), u'}')
    self.assertEqual(u'{{}}'.format(), u'{}')
    self.assertEqual(u'{{x}}'.format(), u'{x}')
    self.assertEqual(u'{{{0}}}'.format(123), u'{123}')
    self.assertEqual(u'{{{{0}}}}'.format(), u'{{0}}')
    self.assertEqual(u'}}{{'.format(), u'}{')
    self.assertEqual(u'}}x{{'.format(), u'}x{')

    # weird field names
    self.assertEqual(u"{0[foo-bar]}".format({u'foo-bar':u'baz'}), u'baz')
    self.assertEqual(u"{0[foo bar]}".format({u'foo bar':u'baz'}), u'baz')
    self.assertEqual(u"{0[ ]}".format({u' ':3}), u'3')

    self.assertEqual(u'{foo._x}'.format(foo=C(20)), u'20')
    self.assertEqual(u'{1}{0}'.format(D(10), D(20)), u'2010')
    self.assertEqual(u'{0._x.x}'.format(C(D(u'abc'))), u'abc')
    self.assertEqual(u'{0[0]}'.format([u'abc', u'def']), u'abc')
    self.assertEqual(u'{0[1]}'.format([u'abc', u'def']), u'def')
    self.assertEqual(u'{0[1][0]}'.format([u'abc', [u'def']]), u'def')
    self.assertEqual(u'{0[1][0].x}'.format(['abc', [D(u'def')]]), u'def')

    # strings
    self.assertEqual(u'{0:.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:.3s}'.format(u'ab'), u'ab')
    self.assertEqual(u'{0:.3s}'.format(u'abcdef'), u'abc')
    self.assertEqual(u'{0:.0s}'.format(u'abcdef'), u'')
    self.assertEqual(u'{0:3.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:2.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:2.2s}'.format(u'abc'), u'ab')
    self.assertEqual(u'{0:3.2s}'.format(u'abc'), u'ab ')
    self.assertEqual(u'{0:x<0s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<5s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<6s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<7s}'.format(u'result'), u'resultx')
    self.assertEqual(u'{0:x<8s}'.format(u'result'), u'resultxx')
    self.assertEqual(u'{0: <7s}'.format(u'result'), u'result ')
    self.assertEqual(u'{0:<7s}'.format(u'result'), u'result ')
    self.assertEqual(u'{0:>7s}'.format(u'result'), u' result')
    # width 8 around a 6-character value needs two fill characters
    self.assertEqual(u'{0:>8s}'.format(u'result'), u' ' * 2 + u'result')
    self.assertEqual(u'{0:^8s}'.format(u'result'), u' result ')
    # centering an odd amount of padding puts the extra fill on the right
    self.assertEqual(u'{0:^9s}'.format(u'result'),
                     u' result' + u' ' * 2)
    self.assertEqual(u'{0:^10s}'.format(u'result'),
                     u' ' * 2 + u'result' + u' ' * 2)
    self.assertEqual(u'{0:10000}'.format(u'a'), u'a' + u' ' * 9999)
    self.assertEqual(u'{0:10000}'.format(u''), u' ' * 10000)
    self.assertEqual(u'{0:10000000}'.format(u''), u' ' * 10000000)

    # format specifiers for user defined type
    self.assertEqual(u'{0:abc}'.format(C()), u'abc')

    # !r and !s coercions
    self.assertEqual(u'{0!s}'.format(u'Hello'), u'Hello')
    self.assertEqual(u'{0!s:}'.format(u'Hello'), u'Hello')
    # width 15 pads the 5-character value with 10 spaces
    self.assertEqual(u'{0!s:15}'.format(u'Hello'), u'Hello' + u' ' * 10)
    self.assertEqual(u'{0!s:15s}'.format(u'Hello'), u'Hello' + u' ' * 10)
    self.assertEqual(u'{0!r}'.format(u'Hello'), u"u'Hello'")
    self.assertEqual(u'{0!r:}'.format(u'Hello'), u"u'Hello'")
    self.assertEqual(u'{0!r}'.format(F(u'Hello')), u'F(Hello)')

    # test fallback to object.__format__
    self.assertEqual(u'{0}'.format({}), u'{}')
    self.assertEqual(u'{0}'.format([]), u'[]')
    self.assertEqual(u'{0}'.format([1]), u'[1]')
    self.assertEqual(u'{0}'.format(E(u'data')), u'E(data)')
    # 'E(data)' is 7 characters: one space on the left, two on the right
    self.assertEqual(u'{0:^10}'.format(E(u'data')),
                     u' E(data)' + u' ' * 2)
    self.assertEqual(u'{0:^10s}'.format(E(u'data')),
                     u' E(data)' + u' ' * 2)
    self.assertEqual(u'{0:d}'.format(G(u'data')), u'G(data)')
    self.assertEqual(u'{0:>15s}'.format(G(u'data')), u' string is data')
    self.assertEqual(u'{0!s}'.format(G(u'data')), u'string is data')

    self.assertEqual(u"{0:date: %Y-%m-%d}".format(I(year=2007,
                                                    month=8,
                                                    day=27)),
                     u"date: 2007-08-27")

    # test deriving from a builtin type and overriding __format__
    self.assertEqual(u"{0}".format(J(10)), u"20")


    # string format specifiers
    self.assertEqual(u'{0:}'.format('a'), u'a')

    # computed format specifiers
    self.assertEqual(u"{0:.{1}}".format(u'hello world', 5), u'hello')
    self.assertEqual(u"{0:.{1}s}".format(u'hello world', 5), u'hello')
    self.assertEqual(u"{0:.{precision}s}".format('hello world', precision=5), u'hello')
    # width 10 pads the 5-character result with 5 spaces
    self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width=10, precision=5),
                     u'hello' + u' ' * 5)
    self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width='10', precision='5'),
                     u'hello' + u' ' * 5)

    # test various errors
    self.assertRaises(ValueError, u'{'.format)
    self.assertRaises(ValueError, u'}'.format)
    self.assertRaises(ValueError, u'a{'.format)
    self.assertRaises(ValueError, u'a}'.format)
    self.assertRaises(ValueError, u'{a'.format)
    self.assertRaises(ValueError, u'}a'.format)
    self.assertRaises(IndexError, u'{0}'.format)
    self.assertRaises(IndexError, u'{1}'.format, u'abc')
    self.assertRaises(KeyError, u'{x}'.format)
    self.assertRaises(ValueError, u"}{".format)
    self.assertRaises(ValueError, u"{".format)
    self.assertRaises(ValueError, u"}".format)
    self.assertRaises(ValueError, u"abc{0:{}".format)
    self.assertRaises(ValueError, u"{0".format)
    self.assertRaises(IndexError, u"{0.}".format)
    self.assertRaises(ValueError, u"{0.}".format, 0)
    self.assertRaises(IndexError, u"{0[}".format)
    self.assertRaises(ValueError, u"{0[}".format, [])
    self.assertRaises(KeyError, u"{0]}".format)
    self.assertRaises(ValueError, u"{0.[]}".format, 0)
    self.assertRaises(ValueError, u"{0..foo}".format, 0)
    self.assertRaises(ValueError, u"{0[0}".format, 0)
    self.assertRaises(ValueError, u"{0[0:foo}".format, 0)
    self.assertRaises(KeyError, u"{c]}".format)
    self.assertRaises(ValueError, u"{{ {{{0}}".format, 0)
    self.assertRaises(ValueError, u"{0}}".format, 0)
    self.assertRaises(KeyError, u"{foo}".format, bar=3)
    self.assertRaises(ValueError, u"{0!x}".format, 3)
    self.assertRaises(ValueError, u"{0!}".format, 0)
    self.assertRaises(ValueError, u"{0!rs}".format, 0)
    self.assertRaises(ValueError, u"{!}".format)
    self.assertRaises(ValueError, u"{:}".format)
    self.assertRaises(ValueError, u"{:s}".format)
    self.assertRaises(ValueError, u"{}".format)
    # a field index too large to be parsed must be rejected
    big = u"23098475029384702983476098230754973209482573"
    self.assertRaises(ValueError, (u"{" + big + u"}").format)
    self.assertRaises(ValueError, (u"{[" + big + u"]}").format, [0])

    # issue 6089
    self.assertRaises(ValueError, u"{0[0]x}".format, [None])
    self.assertRaises(ValueError, u"{0[0](10)}".format, [None])

    # can't have a replacement on the field name portion
    self.assertRaises(TypeError, u'{0[{1}]}'.format, u'abcdefg', 4)

    # exceed maximum recursion depth
    self.assertRaises(ValueError, u"{0:{1:{2}}}".format, u'abc', u's', u'')
    self.assertRaises(ValueError, u"{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
                      0, 1, 2, 3, 4, 5, 6, 7)

    # string format spec errors
    self.assertRaises(ValueError, u"{0:-s}".format, u'')
    self.assertRaises(ValueError, format, u"", u"-")
    self.assertRaises(ValueError, u"{0:=s}".format, u'')

    # test combining string and unicode
    self.assertEqual(u"foo{0}".format('bar'), u'foobar')
    # This will try to convert the argument from unicode to str, which
    # will succeed
    self.assertEqual("foo{0}".format(u'bar'), 'foobar')
    # This will try to convert the argument from unicode to str, which
    # will fail
    self.assertRaises(UnicodeEncodeError, "foo{0}".format, u'\u1000bar')
1293
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +00001294 def test_raiseMemError(self):
1295 # Ensure that the freelist contains a consistent object, even
1296 # when a string allocation fails with a MemoryError.
1297 # This used to crash the interpreter,
1298 # or leak references when the number was smaller.
Antoine Pitrou187ac1b2008-09-05 22:04:54 +00001299 charwidth = 4 if sys.maxunicode >= 0x10000 else 2
1300 # Note: sys.maxsize is half of the actual max allocation because of
1301 # the signedness of Py_ssize_t.
1302 alloc = lambda: u"a" * (sys.maxsize // charwidth * 2)
Antoine Pitroufd7c43e2008-08-17 17:01:49 +00001303 self.assertRaises(MemoryError, alloc)
1304 self.assertRaises(MemoryError, alloc)
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +00001305
def test_format_subclass(self):
    # Formatting a unicode subclass with %s or str.format() must use its
    # overridden __unicode__() rather than the raw string content.
    class U(unicode):
        def __unicode__(self):
            return u'__unicode__ overridden'
    u = U(u'xxx')
    # assertEqual instead of the deprecated assertEquals alias, matching
    # the other tests in this file
    self.assertEqual("%s" % u, u'__unicode__ overridden')
    self.assertEqual("{0}".format(u), u'__unicode__ overridden')
1313
1314
def test_main():
    # Hand this module's name to test_support.run_unittest().
    this_module = __name__
    test_support.run_unittest(this_module)


# Allow running the file directly as a script.
if __name__ == "__main__":
    test_main()