blob: 53c7800c9a3dd0eab65b45885599fa11200519a2 [file] [log] [blame]
Martin v. Löwisa729daf2002-08-04 17:28:33 +00001# -*- coding: iso-8859-1 -*-
Guido van Rossuma831cac2000-03-10 23:23:21 +00002""" Test script for the Unicode implementation.
3
Guido van Rossuma831cac2000-03-10 23:23:21 +00004Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
Marc-André Lemburg36619082001-01-17 19:11:13 +00008"""#"
Christian Heimesc5f05e42008-02-23 17:40:11 +00009import sys, struct, codecs
Walter Dörwald0fd583c2003-02-21 12:53:50 +000010from test import test_support, string_tests
Guido van Rossuma831cac2000-03-10 23:23:21 +000011
Neal Norwitz430f68b2005-11-24 22:00:56 +000012# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook that hands out deliberately broken codecs.

    ``test.unicode1`` yields an encoder/decoder pair that returns a bare
    integer instead of a tuple.  ``test.unicode2`` yields a pair that
    returns a tuple whose first item is an integer rather than a string.
    Any other encoding name yields ``None``.
    """
    def encode1(input, errors="strict"):
        return 42 # not a tuple
    def decode1(input, errors="strict"):
        return 42 # not a tuple
    def encode2(input, errors="strict"):
        return (42, 42) # no unicode
    def decode2(input, errors="strict"):
        return (42, 42) # no unicode

    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    if encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    return None

# Expose the broken codecs through the normal codec lookup.
codecs.register(search_function)
29
Walter Dörwald0fd583c2003-02-21 12:53:50 +000030class UnicodeTest(
31 string_tests.CommonTest,
Walter Dörwald57d88e52004-08-26 16:53:04 +000032 string_tests.MixinStrUnicodeUserStringTest,
33 string_tests.MixinStrUnicodeTest,
Walter Dörwald0fd583c2003-02-21 12:53:50 +000034 ):
35 type2test = unicode
36
37 def checkequalnofix(self, result, object, methodname, *args):
38 method = getattr(object, methodname)
39 realresult = method(*args)
40 self.assertEqual(realresult, result)
41 self.assert_(type(realresult) is type(result))
42
43 # if the original is returned make sure that
44 # this doesn't happen with subclasses
45 if realresult is object:
46 class usub(unicode):
47 def __repr__(self):
48 return 'usub(%r)' % unicode.__repr__(self)
49 object = usub(object)
50 method = getattr(object, methodname)
51 realresult = method(*args)
52 self.assertEqual(realresult, result)
53 self.assert_(object is not realresult)
Guido van Rossume4874ae2001-09-21 15:36:41 +000054
Jeremy Hylton504de6b2003-10-06 05:08:26 +000055 def test_literals(self):
56 self.assertEqual(u'\xff', u'\u00ff')
57 self.assertEqual(u'\uffff', u'\U0000ffff')
Kurt B. Kaiserdb98f362007-07-18 19:58:42 +000058 self.assertRaises(SyntaxError, eval, 'u\'\\Ufffffffe\'')
59 self.assertRaises(SyntaxError, eval, 'u\'\\Uffffffff\'')
60 self.assertRaises(SyntaxError, eval, 'u\'\\U%08x\'' % 0x110000)
Jeremy Hylton504de6b2003-10-06 05:08:26 +000061
Walter Dörwald28256f22003-01-19 16:59:20 +000062 def test_repr(self):
63 if not sys.platform.startswith('java'):
64 # Test basic sanity of repr()
65 self.assertEqual(repr(u'abc'), "u'abc'")
66 self.assertEqual(repr(u'ab\\c'), "u'ab\\\\c'")
67 self.assertEqual(repr(u'ab\\'), "u'ab\\\\'")
68 self.assertEqual(repr(u'\\c'), "u'\\\\c'")
69 self.assertEqual(repr(u'\\'), "u'\\\\'")
70 self.assertEqual(repr(u'\n'), "u'\\n'")
71 self.assertEqual(repr(u'\r'), "u'\\r'")
72 self.assertEqual(repr(u'\t'), "u'\\t'")
73 self.assertEqual(repr(u'\b'), "u'\\x08'")
74 self.assertEqual(repr(u"'\""), """u'\\'"'""")
75 self.assertEqual(repr(u"'\""), """u'\\'"'""")
76 self.assertEqual(repr(u"'"), '''u"'"''')
77 self.assertEqual(repr(u'"'), """u'"'""")
78 latin1repr = (
79 "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
80 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
81 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
82 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
83 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
84 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
85 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
86 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
87 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
88 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
89 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
90 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
91 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
92 "\\xfe\\xff'")
93 testrepr = repr(u''.join(map(unichr, xrange(256))))
94 self.assertEqual(testrepr, latin1repr)
Neal Norwitz17753ec2006-08-21 22:21:19 +000095 # Test repr works on wide unicode escapes without overflow.
96 self.assertEqual(repr(u"\U00010000" * 39 + u"\uffff" * 4096),
97 repr(u"\U00010000" * 39 + u"\uffff" * 4096))
98
Walter Dörwald28256f22003-01-19 16:59:20 +000099
Walter Dörwald28256f22003-01-19 16:59:20 +0000100 def test_count(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000101 string_tests.CommonTest.test_count(self)
102 # check mixed argument types
103 self.checkequalnofix(3, 'aaa', 'count', u'a')
104 self.checkequalnofix(0, 'aaa', 'count', u'b')
105 self.checkequalnofix(3, u'aaa', 'count', 'a')
106 self.checkequalnofix(0, u'aaa', 'count', 'b')
107 self.checkequalnofix(0, u'aaa', 'count', 'b')
108 self.checkequalnofix(1, u'aaa', 'count', 'a', -1)
109 self.checkequalnofix(3, u'aaa', 'count', 'a', -10)
110 self.checkequalnofix(2, u'aaa', 'count', 'a', 0, -1)
111 self.checkequalnofix(0, u'aaa', 'count', 'a', 0, -10)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000112
Walter Dörwald28256f22003-01-19 16:59:20 +0000113 def test_find(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000114 self.checkequalnofix(0, u'abcdefghiabc', 'find', u'abc')
115 self.checkequalnofix(9, u'abcdefghiabc', 'find', u'abc', 1)
116 self.checkequalnofix(-1, u'abcdefghiabc', 'find', u'def', 4)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000117
Walter Dörwald28256f22003-01-19 16:59:20 +0000118 self.assertRaises(TypeError, u'hello'.find)
119 self.assertRaises(TypeError, u'hello'.find, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000120
Walter Dörwald28256f22003-01-19 16:59:20 +0000121 def test_rfind(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000122 string_tests.CommonTest.test_rfind(self)
123 # check mixed argument types
124 self.checkequalnofix(9, 'abcdefghiabc', 'rfind', u'abc')
125 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', u'')
126 self.checkequalnofix(12, u'abcdefghiabc', 'rfind', '')
Guido van Rossum8b264542000-12-19 02:22:31 +0000127
Walter Dörwald28256f22003-01-19 16:59:20 +0000128 def test_index(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000129 string_tests.CommonTest.test_index(self)
130 # check mixed argument types
131 for (t1, t2) in ((str, unicode), (unicode, str)):
132 self.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2(''))
133 self.checkequalnofix(3, t1('abcdefghiabc'), 'index', t2('def'))
134 self.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2('abc'))
135 self.checkequalnofix(9, t1('abcdefghiabc'), 'index', t2('abc'), 1)
136 self.assertRaises(ValueError, t1('abcdefghiabc').index, t2('hib'))
137 self.assertRaises(ValueError, t1('abcdefghiab').index, t2('abc'), 1)
138 self.assertRaises(ValueError, t1('abcdefghi').index, t2('ghi'), 8)
139 self.assertRaises(ValueError, t1('abcdefghi').index, t2('ghi'), -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000140
Walter Dörwald28256f22003-01-19 16:59:20 +0000141 def test_rindex(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000142 string_tests.CommonTest.test_rindex(self)
143 # check mixed argument types
144 for (t1, t2) in ((str, unicode), (unicode, str)):
145 self.checkequalnofix(12, t1('abcdefghiabc'), 'rindex', t2(''))
146 self.checkequalnofix(3, t1('abcdefghiabc'), 'rindex', t2('def'))
147 self.checkequalnofix(9, t1('abcdefghiabc'), 'rindex', t2('abc'))
148 self.checkequalnofix(0, t1('abcdefghiabc'), 'rindex', t2('abc'), 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000149
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000150 self.assertRaises(ValueError, t1('abcdefghiabc').rindex, t2('hib'))
151 self.assertRaises(ValueError, t1('defghiabc').rindex, t2('def'), 1)
152 self.assertRaises(ValueError, t1('defghiabc').rindex, t2('abc'), 0, -1)
153 self.assertRaises(ValueError, t1('abcdefghi').rindex, t2('ghi'), 0, 8)
154 self.assertRaises(ValueError, t1('abcdefghi').rindex, t2('ghi'), 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000155
Walter Dörwald28256f22003-01-19 16:59:20 +0000156 def test_translate(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000157 self.checkequalnofix(u'bbbc', u'abababc', 'translate', {ord('a'):None})
158 self.checkequalnofix(u'iiic', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i')})
159 self.checkequalnofix(u'iiix', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})
160 self.checkequalnofix(u'<i><i><i>c', u'abababc', 'translate', {ord('a'):None, ord('b'):u'<i>'})
161 self.checkequalnofix(u'c', u'abababc', 'translate', {ord('a'):None, ord('b'):u''})
Walter Dörwaldcd736e72004-02-05 17:36:00 +0000162 self.checkequalnofix(u'xyyx', u'xzx', 'translate', {ord('z'):u'yy'})
Guido van Rossuma831cac2000-03-10 23:23:21 +0000163
Walter Dörwald28256f22003-01-19 16:59:20 +0000164 self.assertRaises(TypeError, u'hello'.translate)
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000165 self.assertRaises(TypeError, u'abababc'.translate, {ord('a'):''})
Guido van Rossuma831cac2000-03-10 23:23:21 +0000166
Walter Dörwald28256f22003-01-19 16:59:20 +0000167 def test_split(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000168 string_tests.CommonTest.test_split(self)
Andrew M. Kuchlingeddd68d2002-03-29 16:21:44 +0000169
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000170 # Mixed arguments
171 self.checkequalnofix([u'a', u'b', u'c', u'd'], u'a//b//c//d', 'split', '//')
172 self.checkequalnofix([u'a', u'b', u'c', u'd'], 'a//b//c//d', 'split', u'//')
173 self.checkequalnofix([u'endcase ', u''], u'endcase test', 'split', 'test')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000174
Walter Dörwald28256f22003-01-19 16:59:20 +0000175 def test_join(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000176 string_tests.MixinStrUnicodeUserStringTest.test_join(self)
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000177
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000178 # mixed arguments
179 self.checkequalnofix(u'a b c d', u' ', 'join', ['a', 'b', u'c', u'd'])
180 self.checkequalnofix(u'abcd', u'', 'join', (u'a', u'b', u'c', u'd'))
181 self.checkequalnofix(u'w x y z', u' ', 'join', string_tests.Sequence('wxyz'))
182 self.checkequalnofix(u'a b c d', ' ', 'join', [u'a', u'b', u'c', u'd'])
183 self.checkequalnofix(u'a b c d', ' ', 'join', ['a', 'b', u'c', u'd'])
184 self.checkequalnofix(u'abcd', '', 'join', (u'a', u'b', u'c', u'd'))
185 self.checkequalnofix(u'w x y z', ' ', 'join', string_tests.Sequence(u'wxyz'))
Marc-André Lemburge5034372000-08-08 08:04:29 +0000186
Walter Dörwald28256f22003-01-19 16:59:20 +0000187 def test_strip(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000188 string_tests.CommonTest.test_strip(self)
Walter Dörwald28256f22003-01-19 16:59:20 +0000189 self.assertRaises(UnicodeError, u"hello".strip, "\xff")
Guido van Rossuma831cac2000-03-10 23:23:21 +0000190
Walter Dörwald28256f22003-01-19 16:59:20 +0000191 def test_replace(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000192 string_tests.CommonTest.test_replace(self)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000193
Walter Dörwald28256f22003-01-19 16:59:20 +0000194 # method call forwarded from str implementation because of unicode argument
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000195 self.checkequalnofix(u'one@two!three!', 'one!two!three!', 'replace', u'!', u'@', 1)
Walter Dörwald28256f22003-01-19 16:59:20 +0000196 self.assertRaises(TypeError, 'replace'.replace, u"r", 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000197
Walter Dörwald28256f22003-01-19 16:59:20 +0000198 def test_comparison(self):
199 # Comparisons:
200 self.assertEqual(u'abc', 'abc')
201 self.assertEqual('abc', u'abc')
202 self.assertEqual(u'abc', u'abc')
203 self.assert_(u'abcd' > 'abc')
204 self.assert_('abcd' > u'abc')
205 self.assert_(u'abcd' > u'abc')
206 self.assert_(u'abc' < 'abcd')
207 self.assert_('abc' < u'abcd')
208 self.assert_(u'abc' < u'abcd')
209
210 if 0:
211 # Move these tests to a Unicode collation module test...
212 # Testing UTF-16 code point order comparisons...
213
214 # No surrogates, no fixup required.
215 self.assert_(u'\u0061' < u'\u20ac')
216 # Non surrogate below surrogate value, no fixup required
217 self.assert_(u'\u0061' < u'\ud800\udc02')
218
219 # Non surrogate above surrogate value, fixup required
220 def test_lecmp(s, s2):
221 self.assert_(s < s2)
222
223 def test_fixup(s):
224 s2 = u'\ud800\udc01'
225 test_lecmp(s, s2)
226 s2 = u'\ud900\udc01'
227 test_lecmp(s, s2)
228 s2 = u'\uda00\udc01'
229 test_lecmp(s, s2)
230 s2 = u'\udb00\udc01'
231 test_lecmp(s, s2)
232 s2 = u'\ud800\udd01'
233 test_lecmp(s, s2)
234 s2 = u'\ud900\udd01'
235 test_lecmp(s, s2)
236 s2 = u'\uda00\udd01'
237 test_lecmp(s, s2)
238 s2 = u'\udb00\udd01'
239 test_lecmp(s, s2)
240 s2 = u'\ud800\ude01'
241 test_lecmp(s, s2)
242 s2 = u'\ud900\ude01'
243 test_lecmp(s, s2)
244 s2 = u'\uda00\ude01'
245 test_lecmp(s, s2)
246 s2 = u'\udb00\ude01'
247 test_lecmp(s, s2)
248 s2 = u'\ud800\udfff'
249 test_lecmp(s, s2)
250 s2 = u'\ud900\udfff'
251 test_lecmp(s, s2)
252 s2 = u'\uda00\udfff'
253 test_lecmp(s, s2)
254 s2 = u'\udb00\udfff'
255 test_lecmp(s, s2)
256
257 test_fixup(u'\ue000')
258 test_fixup(u'\uff61')
259
260 # Surrogates on both sides, no fixup required
261 self.assert_(u'\ud800\udc02' < u'\ud84d\udc56')
262
Walter Dörwald28256f22003-01-19 16:59:20 +0000263 def test_islower(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000264 string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
265 self.checkequalnofix(False, u'\u1FFc', 'islower')
Walter Dörwald28256f22003-01-19 16:59:20 +0000266
267 def test_isupper(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000268 string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
269 if not sys.platform.startswith('java'):
270 self.checkequalnofix(False, u'\u1FFc', 'isupper')
Walter Dörwald28256f22003-01-19 16:59:20 +0000271
272 def test_istitle(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000273 string_tests.MixinStrUnicodeUserStringTest.test_title(self)
274 self.checkequalnofix(True, u'\u1FFc', 'istitle')
275 self.checkequalnofix(True, u'Greek \u1FFcitlecases ...', 'istitle')
Walter Dörwald28256f22003-01-19 16:59:20 +0000276
277 def test_isspace(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000278 string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
279 self.checkequalnofix(True, u'\u2000', 'isspace')
280 self.checkequalnofix(True, u'\u200a', 'isspace')
281 self.checkequalnofix(False, u'\u2014', 'isspace')
Walter Dörwald28256f22003-01-19 16:59:20 +0000282
283 def test_isalpha(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000284 string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
285 self.checkequalnofix(True, u'\u1FFc', 'isalpha')
Walter Dörwald28256f22003-01-19 16:59:20 +0000286
287 def test_isdecimal(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000288 self.checkequalnofix(False, u'', 'isdecimal')
289 self.checkequalnofix(False, u'a', 'isdecimal')
290 self.checkequalnofix(True, u'0', 'isdecimal')
291 self.checkequalnofix(False, u'\u2460', 'isdecimal') # CIRCLED DIGIT ONE
292 self.checkequalnofix(False, u'\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
293 self.checkequalnofix(True, u'\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
294 self.checkequalnofix(True, u'0123456789', 'isdecimal')
295 self.checkequalnofix(False, u'0123456789a', 'isdecimal')
Walter Dörwald28256f22003-01-19 16:59:20 +0000296
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000297 self.checkraises(TypeError, 'abc', 'isdecimal', 42)
Walter Dörwald28256f22003-01-19 16:59:20 +0000298
299 def test_isdigit(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000300 string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
301 self.checkequalnofix(True, u'\u2460', 'isdigit')
302 self.checkequalnofix(False, u'\xbc', 'isdigit')
303 self.checkequalnofix(True, u'\u0660', 'isdigit')
Walter Dörwald28256f22003-01-19 16:59:20 +0000304
305 def test_isnumeric(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000306 self.checkequalnofix(False, u'', 'isnumeric')
307 self.checkequalnofix(False, u'a', 'isnumeric')
308 self.checkequalnofix(True, u'0', 'isnumeric')
309 self.checkequalnofix(True, u'\u2460', 'isnumeric')
310 self.checkequalnofix(True, u'\xbc', 'isnumeric')
311 self.checkequalnofix(True, u'\u0660', 'isnumeric')
312 self.checkequalnofix(True, u'0123456789', 'isnumeric')
313 self.checkequalnofix(False, u'0123456789a', 'isnumeric')
Walter Dörwald28256f22003-01-19 16:59:20 +0000314
315 self.assertRaises(TypeError, u"abc".isnumeric, 42)
316
Walter Dörwald28256f22003-01-19 16:59:20 +0000317 def test_contains(self):
318 # Testing Unicode contains method
319 self.assert_('a' in u'abdb')
320 self.assert_('a' in u'bdab')
321 self.assert_('a' in u'bdaba')
322 self.assert_('a' in u'bdba')
323 self.assert_('a' in u'bdba')
324 self.assert_(u'a' in u'bdba')
325 self.assert_(u'a' not in u'bdb')
326 self.assert_(u'a' not in 'bdb')
327 self.assert_(u'a' in 'bdba')
328 self.assert_(u'a' in ('a',1,None))
329 self.assert_(u'a' in (1,None,'a'))
330 self.assert_(u'a' in (1,None,u'a'))
331 self.assert_('a' in ('a',1,None))
332 self.assert_('a' in (1,None,'a'))
333 self.assert_('a' in (1,None,u'a'))
334 self.assert_('a' not in ('x',1,u'y'))
335 self.assert_('a' not in ('x',1,None))
336 self.assert_(u'abcd' not in u'abcxxxx')
337 self.assert_(u'ab' in u'abcd')
338 self.assert_('ab' in u'abc')
339 self.assert_(u'ab' in 'abc')
340 self.assert_(u'ab' in (1,None,u'ab'))
341 self.assert_(u'' in u'abc')
342 self.assert_('' in u'abc')
343
344 # If the following fails either
345 # the contains operator does not propagate UnicodeErrors or
346 # someone has changed the default encoding
347 self.assertRaises(UnicodeError, 'g\xe2teau'.__contains__, u'\xe2')
348
349 self.assert_(u'' in '')
350 self.assert_('' in u'')
351 self.assert_(u'' in u'')
352 self.assert_(u'' in 'abc')
353 self.assert_('' in u'abc')
354 self.assert_(u'' in u'abc')
355 self.assert_(u'\0' not in 'abc')
356 self.assert_('\0' not in u'abc')
357 self.assert_(u'\0' not in u'abc')
358 self.assert_(u'\0' in '\0abc')
359 self.assert_('\0' in u'\0abc')
360 self.assert_(u'\0' in u'\0abc')
361 self.assert_(u'\0' in 'abc\0')
362 self.assert_('\0' in u'abc\0')
363 self.assert_(u'\0' in u'abc\0')
364 self.assert_(u'a' in '\0abc')
365 self.assert_('a' in u'\0abc')
366 self.assert_(u'a' in u'\0abc')
367 self.assert_(u'asdf' in 'asdf')
368 self.assert_('asdf' in u'asdf')
369 self.assert_(u'asdf' in u'asdf')
370 self.assert_(u'asdf' not in 'asd')
371 self.assert_('asdf' not in u'asd')
372 self.assert_(u'asdf' not in u'asd')
373 self.assert_(u'asdf' not in '')
374 self.assert_('asdf' not in u'')
375 self.assert_(u'asdf' not in u'')
376
377 self.assertRaises(TypeError, u"abc".__contains__)
378
379 def test_formatting(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000380 string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
Walter Dörwald28256f22003-01-19 16:59:20 +0000381 # Testing Unicode formatting strings...
382 self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
383 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000, 3.00')
384 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000, 3.00')
385 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000, 3.50')
386 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000, 3.57')
387 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57')
Walter Dörwald28256f22003-01-19 16:59:20 +0000388 if not sys.platform.startswith('java'):
389 self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
390 self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000391 self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')
Walter Dörwald56fbcb52003-03-31 18:18:41 +0000392
Walter Dörwald43440a62003-03-31 18:07:50 +0000393 self.assertEqual(u'%c' % 0x1234, u'\u1234')
Walter Dörwald44f527f2003-04-02 16:37:24 +0000394 self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))
Stefan Krahae7dd8f2010-07-19 18:24:18 +0000395 self.assertRaises(ValueError, u"%.1\u1032f".__mod__, (1.0/3))
Walter Dörwald28256f22003-01-19 16:59:20 +0000396
Victor Stinnerf7270ba2010-02-23 23:20:14 +0000397 for num in range(0x00,0x80):
398 char = chr(num)
399 self.assertEqual(u"%c" % char, char)
400 self.assertEqual(u"%c" % num, char)
401 # Issue 7649
402 for num in range(0x80,0x100):
403 uchar = unichr(num)
404 self.assertEqual(uchar, u"%c" % num) # works only with ints
405 self.assertEqual(uchar, u"%c" % uchar) # and unicode chars
406 # the implicit decoding should fail for non-ascii chars
407 self.assertRaises(UnicodeDecodeError, u"%c".__mod__, chr(num))
408 self.assertRaises(UnicodeDecodeError, u"%s".__mod__, chr(num))
409
Walter Dörwald28256f22003-01-19 16:59:20 +0000410 # formatting jobs delegated from the string implementation:
411 self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
412 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
413 self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...')
414 self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...')
415 self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123}, u'...abc...')
416 self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...')
417 self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...')
418 self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...')
419 self.assertEqual('...%s...' % u"abc", u'...abc...')
420 self.assertEqual('%*s' % (5,u'abc',), u' abc')
421 self.assertEqual('%*s' % (-5,u'abc',), u'abc ')
422 self.assertEqual('%*.*s' % (5,2,u'abc',), u' ab')
423 self.assertEqual('%*.*s' % (5,3,u'abc',), u' abc')
424 self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10 abc')
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000425 self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103 abc')
Walter Dörwald43440a62003-03-31 18:07:50 +0000426 self.assertEqual('%c' % u'a', u'a')
Neil Schemenauercf52c072005-08-12 17:34:58 +0000427 class Wrapper:
428 def __str__(self):
429 return u'\u1234'
430 self.assertEqual('%s' % Wrapper(), u'\u1234')
Tim Peters4511a712006-05-03 04:46:14 +0000431
Georg Brandlde9b6242006-04-30 11:13:56 +0000432 @test_support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
Georg Brandlda6b1072006-01-20 17:48:54 +0000433 def test_format_float(self):
Georg Brandlde9b6242006-04-30 11:13:56 +0000434 # should not format with a comma, but always with C locale
435 self.assertEqual(u'1.0', u'%.1f' % 1.0)
Georg Brandlda6b1072006-01-20 17:48:54 +0000436
Walter Dörwald28256f22003-01-19 16:59:20 +0000437 def test_constructor(self):
438 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
439
440 self.assertEqual(
441 unicode(u'unicode remains unicode'),
442 u'unicode remains unicode'
443 )
444
445 class UnicodeSubclass(unicode):
Marc-André Lemburg79f57832002-12-29 19:44:06 +0000446 pass
Guido van Rossuma831cac2000-03-10 23:23:21 +0000447
Walter Dörwald28256f22003-01-19 16:59:20 +0000448 self.assertEqual(
449 unicode(UnicodeSubclass('unicode subclass becomes unicode')),
450 u'unicode subclass becomes unicode'
451 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000452
Walter Dörwald28256f22003-01-19 16:59:20 +0000453 self.assertEqual(
454 unicode('strings are converted to unicode'),
455 u'strings are converted to unicode'
456 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000457
Walter Dörwald28256f22003-01-19 16:59:20 +0000458 class UnicodeCompat:
459 def __init__(self, x):
460 self.x = x
461 def __unicode__(self):
462 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000463
Walter Dörwald28256f22003-01-19 16:59:20 +0000464 self.assertEqual(
465 unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
466 u'__unicode__ compatible objects are recognized')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000467
Walter Dörwald28256f22003-01-19 16:59:20 +0000468 class StringCompat:
469 def __init__(self, x):
470 self.x = x
471 def __str__(self):
472 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000473
Walter Dörwald28256f22003-01-19 16:59:20 +0000474 self.assertEqual(
475 unicode(StringCompat('__str__ compatible objects are recognized')),
476 u'__str__ compatible objects are recognized'
477 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000478
Walter Dörwald28256f22003-01-19 16:59:20 +0000479 # unicode(obj) is compatible to str():
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000480
Walter Dörwald28256f22003-01-19 16:59:20 +0000481 o = StringCompat('unicode(obj) is compatible to str()')
482 self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
483 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000484
Marc-André Lemburgd25c6502004-07-23 16:13:25 +0000485 # %-formatting and .__unicode__()
486 self.assertEqual(u'%s' %
487 UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"),
488 u"u'%s' % obj uses obj.__unicode__()")
489 self.assertEqual(u'%s' %
490 UnicodeCompat(u"u'%s' % obj falls back to obj.__str__()"),
491 u"u'%s' % obj falls back to obj.__str__()")
492
Walter Dörwald28256f22003-01-19 16:59:20 +0000493 for obj in (123, 123.45, 123L):
494 self.assertEqual(unicode(obj), unicode(str(obj)))
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000495
Walter Dörwald28256f22003-01-19 16:59:20 +0000496 # unicode(obj, encoding, error) tests (this maps to
497 # PyUnicode_FromEncodedObject() at C level)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000498
Walter Dörwald28256f22003-01-19 16:59:20 +0000499 if not sys.platform.startswith('java'):
500 self.assertRaises(
501 TypeError,
502 unicode,
503 u'decoding unicode is not supported',
504 'utf-8',
505 'strict'
506 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000507
Walter Dörwald28256f22003-01-19 16:59:20 +0000508 self.assertEqual(
509 unicode('strings are decoded to unicode', 'utf-8', 'strict'),
510 u'strings are decoded to unicode'
511 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000512
Walter Dörwald28256f22003-01-19 16:59:20 +0000513 if not sys.platform.startswith('java'):
514 self.assertEqual(
515 unicode(
516 buffer('character buffers are decoded to unicode'),
517 'utf-8',
518 'strict'
519 ),
520 u'character buffers are decoded to unicode'
521 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000522
Walter Dörwald28256f22003-01-19 16:59:20 +0000523 self.assertRaises(TypeError, unicode, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000524
Walter Dörwald28256f22003-01-19 16:59:20 +0000525 def test_codecs_utf7(self):
526 utfTests = [
527 (u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
528 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
529 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
530 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
531 (u'+', '+-'),
532 (u'+-', '+--'),
533 (u'+?', '+-?'),
534 (u'\?', '+AFw?'),
535 (u'+?', '+-?'),
536 (ur'\\?', '+AFwAXA?'),
537 (ur'\\\?', '+AFwAXABc?'),
538 (ur'++--', '+-+---')
539 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000540
Walter Dörwald28256f22003-01-19 16:59:20 +0000541 for (x, y) in utfTests:
542 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000543
Walter Dörwald28256f22003-01-19 16:59:20 +0000544 # surrogates not supported
545 self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000546
Walter Dörwald28256f22003-01-19 16:59:20 +0000547 self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000548
Antoine Pitrou4982d5d2008-07-25 17:45:59 +0000549 # Issue #2242: crash on some Windows/MSVC versions
550 self.assertRaises(UnicodeDecodeError, '+\xc1'.decode, 'utf-7')
551
Walter Dörwald28256f22003-01-19 16:59:20 +0000552 def test_codecs_utf8(self):
553 self.assertEqual(u''.encode('utf-8'), '')
554 self.assertEqual(u'\u20ac'.encode('utf-8'), '\xe2\x82\xac')
555 self.assertEqual(u'\ud800\udc02'.encode('utf-8'), '\xf0\x90\x80\x82')
556 self.assertEqual(u'\ud84d\udc56'.encode('utf-8'), '\xf0\xa3\x91\x96')
557 self.assertEqual(u'\ud800'.encode('utf-8'), '\xed\xa0\x80')
558 self.assertEqual(u'\udc00'.encode('utf-8'), '\xed\xb0\x80')
559 self.assertEqual(
560 (u'\ud800\udc02'*1000).encode('utf-8'),
561 '\xf0\x90\x80\x82'*1000
562 )
563 self.assertEqual(
564 u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
565 u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
566 u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
567 u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
568 u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
569 u' Nunstuck git und'.encode('utf-8'),
570 '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
571 '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
572 '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
573 '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
574 '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
575 '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
576 '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
577 '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
578 '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
579 '\xe3\x80\x8cWenn ist das Nunstuck git und'
580 )
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000581
Walter Dörwald28256f22003-01-19 16:59:20 +0000582 # UTF-8 specific decoding tests
583 self.assertEqual(unicode('\xf0\xa3\x91\x96', 'utf-8'), u'\U00023456' )
584 self.assertEqual(unicode('\xf0\x90\x80\x82', 'utf-8'), u'\U00010002' )
585 self.assertEqual(unicode('\xe2\x82\xac', 'utf-8'), u'\u20ac' )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000586
Walter Dörwald28256f22003-01-19 16:59:20 +0000587 # Other possible utf-8 test cases:
588 # * strict decoding testing for all of the
589 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000590
Ezio Melotti86e5e172010-07-03 05:34:39 +0000591 def test_utf8_decode_valid_sequences(self):
592 sequences = [
593 # single byte
594 ('\x00', u'\x00'), ('a', u'a'), ('\x7f', u'\x7f'),
595 # 2 bytes
596 ('\xc2\x80', u'\x80'), ('\xdf\xbf', u'\u07ff'),
597 # 3 bytes
598 ('\xe0\xa0\x80', u'\u0800'), ('\xed\x9f\xbf', u'\ud7ff'),
599 ('\xee\x80\x80', u'\uE000'), ('\xef\xbf\xbf', u'\uffff'),
600 # 4 bytes
601 ('\xF0\x90\x80\x80', u'\U00010000'),
602 ('\xf4\x8f\xbf\xbf', u'\U0010FFFF')
603 ]
604 for seq, res in sequences:
605 self.assertEqual(seq.decode('utf-8'), res)
606
607 for ch in map(unichr, range(0, sys.maxunicode)):
608 self.assertEqual(ch, ch.encode('utf-8').decode('utf-8'))
609
def test_utf8_decode_invalid_sequences(self):
    """Check that the strict UTF-8 decoder rejects ill-formed input.

    Covers bytes that may not start a sequence, start bytes that only
    occur in overlong forms, and sequences that would encode code
    points above U+10FFFF.
    """
    def assert_invalid(seq):
        # Every ill-formed sequence must raise in (default) strict mode.
        self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8')

    # continuation bytes in a sequence of 2, 3, or 4 bytes
    continuation_bytes = map(chr, range(0x80, 0xC0))
    # start bytes of a 2-byte sequence equivalent to codepoints <= 0x7F
    invalid_2B_seq_start_bytes = map(chr, range(0xC0, 0xC2))
    # start bytes of a 4-byte sequence equivalent to codepoints > 0x10FFFF
    invalid_4B_seq_start_bytes = map(chr, range(0xF5, 0xF8))
    # 0xF7 already belongs to invalid_4B_seq_start_bytes, so the last
    # range starts at 0xF8 to avoid checking the same byte twice.
    invalid_start_bytes = (
        continuation_bytes + invalid_2B_seq_start_bytes +
        invalid_4B_seq_start_bytes + map(chr, range(0xF8, 0x100))
    )

    for byte in invalid_start_bytes:
        assert_invalid(byte)

    for sb in invalid_2B_seq_start_bytes:
        for cb in continuation_bytes:
            assert_invalid(sb+cb)

    for sb in invalid_4B_seq_start_bytes:
        for cb1 in continuation_bytes[:3]:
            for cb3 in continuation_bytes[:3]:
                assert_invalid(sb+cb1+'\x80'+cb3)

    # 0xE0 followed by 0x80..0x9F
    for cb in map(chr, range(0x80, 0xA0)):
        assert_invalid('\xE0'+cb+'\x80')
        assert_invalid('\xE0'+cb+'\xBF')
    # XXX: surrogates shouldn't be valid UTF-8!
    # see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
    # (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
    #for cb in map(chr, range(0xA0, 0xC0)):
        #assert_invalid('\xED'+cb+'\x80')
        #assert_invalid('\xED'+cb+'\xBF')
    # 0xF0 followed by 0x80..0x8F
    for cb in map(chr, range(0x80, 0x90)):
        assert_invalid('\xF0'+cb+'\x80\x80')
        assert_invalid('\xF0'+cb+'\xBF\xBF')
    # 0xF4 followed by 0x90..0xBF
    for cb in map(chr, range(0x90, 0xC0)):
        assert_invalid('\xF4'+cb+'\x80\x80')
        assert_invalid('\xF4'+cb+'\xBF\xBF')
659
def test_issue8271(self):
    """Check how many U+FFFD an invalid UTF-8 sequence is replaced with.

    Each table row pairs an ill-formed byte string with the text the
    'replace' handler must produce; the same rows drive the 'strict'
    and 'ignore' checks at the end.
    """
    # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
    # only the start byte and the continuation byte(s) are now considered
    # invalid, instead of the number of bytes specified by the start byte.
    # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
    # table 3-8, Row 2) for more information about the algorithm used.
    FFFD = u'\ufffd'
    sequences = [
        # invalid start bytes
        ('\x80', FFFD), # continuation byte
        ('\x80\x80', FFFD*2), # 2 continuation bytes
        ('\xc0', FFFD),
        ('\xc0\xc0', FFFD*2),
        ('\xc1', FFFD),
        ('\xc1\xc0', FFFD*2),
        ('\xc0\xc1', FFFD*2),
        # with start byte of a 2-byte sequence
        ('\xc2', FFFD), # only the start byte
        ('\xc2\xc2', FFFD*2), # 2 start bytes
        ('\xc2\xc2\xc2', FFFD*3), # 3 start bytes
        ('\xc2\x41', FFFD+'A'), # invalid continuation byte
        # with start byte of a 3-byte sequence
        ('\xe1', FFFD), # only the start byte
        ('\xe1\xe1', FFFD*2), # 2 start bytes
        ('\xe1\xe1\xe1', FFFD*3), # 3 start bytes
        ('\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
        ('\xe1\x80', FFFD), # only 1 continuation byte
        ('\xe1\x41', FFFD+'A'), # invalid continuation byte
        ('\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
        ('\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
        ('\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
        ('\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
        ('\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
        # with start byte of a 4-byte sequence
        ('\xf1', FFFD), # only the start byte
        ('\xf1\xf1', FFFD*2), # 2 start bytes
        ('\xf1\xf1\xf1', FFFD*3), # 3 start bytes
        ('\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
        ('\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
        ('\xf1\x80', FFFD), # only 1 continuation byte
        ('\xf1\x80\x80', FFFD), # only 2 continuation bytes
        ('\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
        ('\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 2 invalid
        ('\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
        ('\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cb and 1 valid
        ('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 valid
        ('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # invalid, valid, invalid
        ('\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 2 invalid cb and 1 valid
        ('\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
        ('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
        ('\xf1\xf1\x80\x41', FFFD*2+'A'),
        ('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
        # with invalid start byte of a 4-byte sequence (rfc2279)
        ('\xf5', FFFD), # only the start byte
        ('\xf5\xf5', FFFD*2), # 2 start bytes
        ('\xf5\x80', FFFD*2), # only 1 continuation byte
        ('\xf5\x80\x80', FFFD*3), # only 2 continuation bytes
        ('\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
        ('\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
        ('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
        ('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
        # with invalid start byte of a 5-byte sequence (rfc2279)
        ('\xf8', FFFD), # only the start byte
        ('\xf8\xf8', FFFD*2), # 2 start bytes
        ('\xf8\x80', FFFD*2), # only one continuation byte
        ('\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
        ('\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
        # with invalid start byte of a 6-byte sequence (rfc2279)
        ('\xfc', FFFD), # only the start byte
        ('\xfc\xfc', FFFD*2), # 2 start bytes
        ('\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
        ('\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
        # invalid start byte
        ('\xfe', FFFD),
        ('\xfe\x80\x80', FFFD*3),
        # other sequences
        ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
        ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
        ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
        ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
         u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
    ]
    for seq, res in sequences:
        # strict mode must refuse every row ...
        self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
        # ... 'replace' must yield exactly the expected text, also when
        # a valid byte follows the broken sequence ...
        self.assertEqual(seq.decode('utf-8', 'replace'), res)
        self.assertEqual((seq+'b').decode('utf-8', 'replace'), res+'b')
        # ... and 'ignore' must drop what 'replace' turned into U+FFFD.
        self.assertEqual(seq.decode('utf-8', 'ignore'),
                         res.replace(u'\uFFFD', ''))
748
def test_codecs_idna(self):
    """The idna codec must keep the trailing dot of a host name."""
    hostname = u"www.python.org."
    encoded = hostname.encode("idna")
    self.assertEqual(encoded, "www.python.org.")
752
def test_codecs_errors(self):
    """Exercise error handlers and failure modes of encode()/decode()."""
    # Error handling (encoding)
    text = u'Andr\202 x'
    self.assertRaises(UnicodeError, text.encode, 'ascii')
    self.assertRaises(UnicodeError, text.encode, 'ascii', 'strict')
    for handler, encoded in (('ignore', "Andr x"), ('replace', "Andr? x")):
        self.assertEqual(text.encode('ascii', handler), encoded)

    # Error handling (decoding)
    raw = 'Andr\202 x'
    self.assertRaises(UnicodeError, unicode, raw, 'ascii')
    self.assertRaises(UnicodeError, unicode, raw, 'ascii', 'strict')
    for handler, decoded in (('ignore', u"Andr x"),
                             ('replace', u'Andr\uFFFD x')):
        self.assertEqual(unicode(raw, 'ascii', handler), decoded)

    # Error handling (unknown character names)
    self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx")

    # Error handling (truncated escape sequence)
    self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")

    # Codecs whose encode/decode functions return malformed results
    # (registered through search_function at the top of the file)
    self.assertRaises(TypeError, "hello".decode, "test.unicode1")
    self.assertRaises(TypeError, unicode, "hello", "test.unicode2")
    self.assertRaises(TypeError, u"hello".encode, "test.unicode1")
    self.assertRaises(TypeError, u"hello".encode, "test.unicode2")
    # executes PyUnicode_Encode()
    import imp
    search_path = [u"non-existing dir"]
    self.assertRaises(ImportError, imp.find_module,
                      "non-existing module", search_path)

    # Error handling (wrong arguments)
    self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)

    # Error handling (PyUnicode_EncodeDecimal())
    self.assertRaises(UnicodeError, int, u"\u0200")
Guido van Rossum97064862000-04-10 13:52:48 +0000790
def test_codecs(self):
    """Check plain encoding results and encode/decode round-trips."""
    # Encoding
    hello_encoded = (
        ('ascii', 'hello'),
        ('utf-7', 'hello'),
        ('utf-8', 'hello'),
        ('utf8', 'hello'),
        ('utf-16-le', 'h\000e\000l\000l\000o\000'),
        ('utf-16-be', '\000h\000e\000l\000l\000o'),
        ('latin-1', 'hello'),
    )
    for name, expected in hello_encoded:
        self.assertEqual(u'hello'.encode(name), expected)

    # Roundtrip safety for BMP: each entry is (number of leading code
    # points to check, encodings that must round-trip them)
    bmp_prefixes = (
        (1024, ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
                'utf-16-be', 'raw_unicode_escape',
                'unicode_escape', 'unicode_internal')),
        (256, ('latin-1',)),
        (128, ('ascii',)),
    )
    for limit, names in bmp_prefixes:
        for code in xrange(limit):
            char = unichr(code)
            for name in names:
                self.assertEqual(unicode(char.encode(name), name), char)

    # Roundtrip safety for non-BMP (just a few chars)
    astral = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
    for name in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                 #'raw_unicode_escape',
                 'unicode_escape', 'unicode_internal'):
        self.assertEqual(unicode(astral.encode(name), name), astral)

    # UTF-8 must be roundtrip safe for all UCS-2 code points
    # This excludes surrogates: in the full range, there would be
    # a surrogate pair (\udbff\udc00), which gets converted back
    # to a non-BMP character (\U0010fc00)
    code_points = range(0, 0xd800) + range(0xe000, 0x10000)
    ucs2 = u''.join(map(unichr, code_points))
    for name in ('utf-8',):
        self.assertEqual(unicode(ucs2.encode(name), name), ucs2)
Guido van Rossum9e896b32000-04-05 20:11:21 +0000835
def test_codecs_charmap(self):
    """Decode and re-encode both halves of the byte range per codec."""
    def check_roundtrip(data, encodings):
        # decoding followed by encoding must give back the same bytes
        for encoding in encodings:
            self.assertEqual(unicode(data, encoding).encode(encoding), data)

    # 0-127
    lower_half = ''.join(map(chr, xrange(128)))
    check_roundtrip(lower_half, (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
        'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        'cp1256', 'cp1257', 'cp1258',
        'cp856', 'cp857', 'cp864', 'cp869', 'cp874',

        'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
        'cp1006', 'iso8859_8',

        ### These have undefined mappings:
        #'cp424',

        ### These fail the round-trip:
        #'cp875'
    ))

    # 128-255
    upper_half = ''.join(map(chr, xrange(128, 256)))
    check_roundtrip(upper_half, (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_4', 'iso8859_5',
        'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        ### These have undefined mappings:
        #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        #'cp1256', 'cp1257', 'cp1258',
        #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
        #'iso8859_3', 'iso8859_6', 'iso8859_7',
        #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',

        ### These fail the round-trip:
        #'cp1006', 'cp875', 'iso8859_8',
    ))
Guido van Rossum9e896b32000-04-05 20:11:21 +0000889
def test_concatenation(self):
    """Adjacent str/unicode literals must concatenate to the full text."""
    # The left column relies on implicit literal concatenation, so the
    # literals have to stay next to each other without an operator.
    cases = (
        (u"abc" u"def", u"abcdef"),
        ("abc" u"def", u"abcdef"),
        (u"abc" "def", u"abcdef"),
        (u"abc" u"def" "ghi", u"abcdefghi"),
        ("abc" "def" u"ghi", u"abcdefghi"),
    )
    for joined, expected in cases:
        self.assertEqual(joined, expected)
Fred Drake004d5e62000-10-23 17:22:08 +0000896
def test_printing(self):
    """Print unicode/str mixes to a file-like object that drops its input."""
    class NullWriter:
        # minimal file-like object: accepts writes and discards them
        def write(self, text):
            pass

    sink = NullWriter()
    # mixes of unicode and str arguments
    print >>sink, u'abc'
    print >>sink, u'abc', u'def'
    print >>sink, u'abc', 'def'
    print >>sink, 'abc', u'def'
    # text ending in a newline, with and without the trailing comma
    print >>sink, u'abc\n'
    print >>sink, u'abc\n',
    print >>sink, u'abc\n',
    print >>sink, u'def\n'
    print >>sink, u'def\n'
Fred Drake004d5e62000-10-23 17:22:08 +0000912
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +0000913 def test_ucs4(self):
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +0000914 x = u'\U00100000'
915 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
916 self.assertEqual(x, y)
917
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +0000918 y = r'\U00100000'
919 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
920 self.assertEqual(x, y)
921 y = r'\U00010000'
922 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
923 self.assertEqual(x, y)
924
925 try:
926 '\U11111111'.decode("raw-unicode-escape")
927 except UnicodeDecodeError as e:
928 self.assertEqual(e.start, 0)
929 self.assertEqual(e.end, 10)
930 else:
931 self.fail("Should have raised UnicodeDecodeError")
932
def test_conversion(self):
    """Check which hook unicode() and str() use for various classes."""
    # Make sure __unicode__() works properly

    # old-style class with __str__ only
    class Foo0:
        def __str__(self):
            return "foo"

    # old-style class with __unicode__
    class Foo1:
        def __unicode__(self):
            return u"foo"

    # new-style class with __unicode__
    class Foo2(object):
        def __unicode__(self):
            return u"foo"

    # __unicode__ returning a str
    class Foo3(object):
        def __unicode__(self):
            return "foo"

    # str subclass with __unicode__ returning a str
    class Foo4(str):
        def __unicode__(self):
            return "foo"

    # unicode subclass with __unicode__ returning a str
    class Foo5(unicode):
        def __unicode__(self):
            return "foo"

    # str subclass defining both hooks
    class Foo6(str):
        def __str__(self):
            return "foos"

        def __unicode__(self):
            return u"foou"

    # unicode subclass defining both hooks
    class Foo7(unicode):
        def __str__(self):
            return "foos"
        def __unicode__(self):
            return u"foou"

    # unicode subclass whose __unicode__ returns the instance itself
    class Foo8(unicode):
        def __new__(cls, content=""):
            return unicode.__new__(cls, 2*content)
        def __unicode__(self):
            return self

    # unicode subclass with differing __str__ and __unicode__ results
    class Foo9(unicode):
        def __str__(self):
            return "string"
        def __unicode__(self):
            return "not unicode"

    conversions = (
        (Foo0(), u"foo"),
        (Foo1(), u"foo"),
        (Foo2(), u"foo"),
        (Foo3(), u"foo"),
        (Foo4("bar"), u"foo"),
        (Foo5("bar"), u"foo"),
        (Foo6("bar"), u"foou"),
        (Foo7("bar"), u"foou"),
        (Foo8("foo"), u"foofoo"),
    )
    for obj, expected in conversions:
        self.assertEqual(unicode(obj), expected)
    self.assertEqual(str(Foo9("foo")), "string")
    self.assertEqual(unicode(Foo9("foo")), u"not unicode")
995
def test_unicode_repr(self):
    """repr() must work when __repr__ returns either str or unicode."""
    class StrRepr:
        def __repr__(self):
            return '\\n'

    class UnicodeRepr:
        def __repr__(self):
            return u'\\n'

    for obj in (StrRepr(), UnicodeRepr()):
        self.assertEqual(repr(obj), '\\n')
1007
def test_expandtabs_overflows_gracefully(self):
    """expandtabs() with a huge tab size must raise OverflowError."""
    # This test only affects 32-bit platforms because expandtabs can only take
    # an int as the max value, not a 64-bit C long. If expandtabs is changed
    # to take a 64-bit long, this test should apply to all platforms.
    applies_here = sys.maxint <= (1 << 32) and struct.calcsize('P') == 4
    if applies_here:
        self.assertRaises(OverflowError, u't\tt\t'.expandtabs, sys.maxint)
Anthony Baxter67b6d512006-03-30 10:54:07 +00001015
Eric Smitha9f7d622008-02-17 19:46:49 +00001016 def test__format__(self):
1017 def test(value, format, expected):
1018 # test both with and without the trailing 's'
1019 self.assertEqual(value.__format__(format), expected)
1020 self.assertEqual(value.__format__(format + u's'), expected)
1021
1022 test(u'', u'', u'')
1023 test(u'abc', u'', u'abc')
1024 test(u'abc', u'.3', u'abc')
1025 test(u'ab', u'.3', u'ab')
1026 test(u'abcdef', u'.3', u'abc')
1027 test(u'abcdef', u'.0', u'')
1028 test(u'abc', u'3.3', u'abc')
1029 test(u'abc', u'2.3', u'abc')
1030 test(u'abc', u'2.2', u'ab')
1031 test(u'abc', u'3.2', u'ab ')
1032 test(u'result', u'x<0', u'result')
1033 test(u'result', u'x<5', u'result')
1034 test(u'result', u'x<6', u'result')
1035 test(u'result', u'x<7', u'resultx')
1036 test(u'result', u'x<8', u'resultxx')
1037 test(u'result', u' <7', u'result ')
1038 test(u'result', u'<7', u'result ')
1039 test(u'result', u'>7', u' result')
1040 test(u'result', u'>8', u' result')
1041 test(u'result', u'^8', u' result ')
1042 test(u'result', u'^9', u' result ')
1043 test(u'result', u'^10', u' result ')
1044 test(u'a', u'10000', u'a' + u' ' * 9999)
1045 test(u'', u'10000', u' ' * 10000)
1046 test(u'', u'10000000', u' ' * 10000000)
1047
1048 # test mixing unicode and str
1049 self.assertEqual(u'abc'.__format__('s'), u'abc')
1050 self.assertEqual(u'abc'.__format__('->10s'), u'-------abc')
1051
def test_format(self):
    """Check unicode.format(): field lookup, format specs and errors.

    Expected values that need more than one padding character spell
    the padding out as ``u' ' * n`` so that the amount of padding is
    explicit and cannot be lost when whitespace is normalised.
    """
    self.assertEqual(u''.format(), u'')
    self.assertEqual(u'a'.format(), u'a')
    self.assertEqual(u'ab'.format(), u'ab')
    self.assertEqual(u'a{{'.format(), u'a{')
    self.assertEqual(u'a}}'.format(), u'a}')
    self.assertEqual(u'{{b'.format(), u'{b')
    self.assertEqual(u'}}b'.format(), u'}b')
    self.assertEqual(u'a{{b'.format(), u'a{b')

    # examples from the PEP:
    import datetime
    self.assertEqual(u"My name is {0}".format(u'Fred'), u"My name is Fred")
    self.assertEqual(u"My name is {0[name]}".format(dict(name=u'Fred')),
                     u"My name is Fred")
    self.assertEqual(u"My name is {0} :-{{}}".format(u'Fred'),
                     u"My name is Fred :-{}")

    # datetime.__format__ doesn't work with unicode
    #d = datetime.date(2007, 8, 18)
    #self.assertEqual("The year is {0.year}".format(d),
    #                 "The year is 2007")

    # classes we'll use for testing
    class C:
        def __init__(self, x=100):
            self._x = x
        def __format__(self, spec):
            return spec

    class D:
        def __init__(self, x):
            self.x = x
        def __format__(self, spec):
            return str(self.x)

    # class with __str__, but no __format__
    class E:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return u'E(' + self.x + u')'

    # class with __repr__, but no __format__ or __str__
    class F:
        def __init__(self, x):
            self.x = x
        def __repr__(self):
            return u'F(' + self.x + u')'

    # class with __format__ that forwards to string, for some format_spec's
    class G:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return u"string is " + self.x
        def __format__(self, format_spec):
            if format_spec == 'd':
                return u'G(' + self.x + u')'
            return object.__format__(self, format_spec)

    # class that returns a bad type from __format__
    class H:
        def __format__(self, format_spec):
            return 1.0

    class I(datetime.date):
        def __format__(self, format_spec):
            return self.strftime(format_spec)

    class J(int):
        def __format__(self, format_spec):
            return int.__format__(self * 2, format_spec)


    self.assertEqual(u''.format(), u'')
    self.assertEqual(u'abc'.format(), u'abc')
    self.assertEqual(u'{0}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:}'.format(u'abc'), u'abc')
    self.assertEqual(u'X{0}'.format(u'abc'), u'Xabc')
    self.assertEqual(u'{0}X'.format(u'abc'), u'abcX')
    self.assertEqual(u'X{0}Y'.format(u'abc'), u'XabcY')
    self.assertEqual(u'{1}'.format(1, u'abc'), u'abc')
    self.assertEqual(u'X{1}'.format(1, u'abc'), u'Xabc')
    self.assertEqual(u'{1}X'.format(1, u'abc'), u'abcX')
    self.assertEqual(u'X{1}Y'.format(1, u'abc'), u'XabcY')
    self.assertEqual(u'{0}'.format(-15), u'-15')
    self.assertEqual(u'{0}{1}'.format(-15, u'abc'), u'-15abc')
    self.assertEqual(u'{0}X{1}'.format(-15, u'abc'), u'-15Xabc')
    self.assertEqual(u'{{'.format(), u'{')
    self.assertEqual(u'}}'.format(), u'}')
    self.assertEqual(u'{{}}'.format(), u'{}')
    self.assertEqual(u'{{x}}'.format(), u'{x}')
    self.assertEqual(u'{{{0}}}'.format(123), u'{123}')
    self.assertEqual(u'{{{{0}}}}'.format(), u'{{0}}')
    self.assertEqual(u'}}{{'.format(), u'}{')
    self.assertEqual(u'}}x{{'.format(), u'}x{')

    # weird field names
    self.assertEqual(u"{0[foo-bar]}".format({u'foo-bar':u'baz'}), u'baz')
    self.assertEqual(u"{0[foo bar]}".format({u'foo bar':u'baz'}), u'baz')
    self.assertEqual(u"{0[ ]}".format({u' ':3}), u'3')

    self.assertEqual(u'{foo._x}'.format(foo=C(20)), u'20')
    self.assertEqual(u'{1}{0}'.format(D(10), D(20)), u'2010')
    self.assertEqual(u'{0._x.x}'.format(C(D(u'abc'))), u'abc')
    self.assertEqual(u'{0[0]}'.format([u'abc', u'def']), u'abc')
    self.assertEqual(u'{0[1]}'.format([u'abc', u'def']), u'def')
    self.assertEqual(u'{0[1][0]}'.format([u'abc', [u'def']]), u'def')
    self.assertEqual(u'{0[1][0].x}'.format(['abc', [D(u'def')]]), u'def')

    # strings
    self.assertEqual(u'{0:.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:.3s}'.format(u'ab'), u'ab')
    self.assertEqual(u'{0:.3s}'.format(u'abcdef'), u'abc')
    self.assertEqual(u'{0:.0s}'.format(u'abcdef'), u'')
    self.assertEqual(u'{0:3.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:2.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:2.2s}'.format(u'abc'), u'ab')
    self.assertEqual(u'{0:3.2s}'.format(u'abc'), u'ab' + u' ' * 1)
    self.assertEqual(u'{0:x<0s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<5s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<6s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<7s}'.format(u'result'), u'resultx')
    self.assertEqual(u'{0:x<8s}'.format(u'result'), u'resultxx')
    self.assertEqual(u'{0: <7s}'.format(u'result'), u'result' + u' ' * 1)
    self.assertEqual(u'{0:<7s}'.format(u'result'), u'result' + u' ' * 1)
    self.assertEqual(u'{0:>7s}'.format(u'result'), u' ' * 1 + u'result')
    self.assertEqual(u'{0:>8s}'.format(u'result'), u' ' * 2 + u'result')
    # centering: an odd amount of padding puts the extra space on the right
    self.assertEqual(u'{0:^8s}'.format(u'result'),
                     u' ' * 1 + u'result' + u' ' * 1)
    self.assertEqual(u'{0:^9s}'.format(u'result'),
                     u' ' * 1 + u'result' + u' ' * 2)
    self.assertEqual(u'{0:^10s}'.format(u'result'),
                     u' ' * 2 + u'result' + u' ' * 2)
    self.assertEqual(u'{0:10000}'.format(u'a'), u'a' + u' ' * 9999)
    self.assertEqual(u'{0:10000}'.format(u''), u' ' * 10000)
    self.assertEqual(u'{0:10000000}'.format(u''), u' ' * 10000000)

    # format specifiers for user defined type
    self.assertEqual(u'{0:abc}'.format(C()), u'abc')

    # !r and !s coercions
    self.assertEqual(u'{0!s}'.format(u'Hello'), u'Hello')
    self.assertEqual(u'{0!s:}'.format(u'Hello'), u'Hello')
    self.assertEqual(u'{0!s:15}'.format(u'Hello'), u'Hello' + u' ' * 10)
    self.assertEqual(u'{0!s:15s}'.format(u'Hello'), u'Hello' + u' ' * 10)
    self.assertEqual(u'{0!r}'.format(u'Hello'), u"u'Hello'")
    self.assertEqual(u'{0!r:}'.format(u'Hello'), u"u'Hello'")
    self.assertEqual(u'{0!r}'.format(F(u'Hello')), u'F(Hello)')

    # test fallback to object.__format__
    self.assertEqual(u'{0}'.format({}), u'{}')
    self.assertEqual(u'{0}'.format([]), u'[]')
    self.assertEqual(u'{0}'.format([1]), u'[1]')
    self.assertEqual(u'{0}'.format(E(u'data')), u'E(data)')
    self.assertEqual(u'{0:^10}'.format(E(u'data')),
                     u' ' * 1 + u'E(data)' + u' ' * 2)
    self.assertEqual(u'{0:^10s}'.format(E(u'data')),
                     u' ' * 1 + u'E(data)' + u' ' * 2)
    self.assertEqual(u'{0:d}'.format(G(u'data')), u'G(data)')
    self.assertEqual(u'{0:>15s}'.format(G(u'data')),
                     u' ' * 1 + u'string is data')
    self.assertEqual(u'{0!s}'.format(G(u'data')), u'string is data')

    self.assertEqual(u"{0:date: %Y-%m-%d}".format(I(year=2007,
                                                    month=8,
                                                    day=27)),
                     u"date: 2007-08-27")

    # test deriving from a builtin type and overriding __format__
    self.assertEqual(u"{0}".format(J(10)), u"20")


    # string format specifiers
    self.assertEqual(u'{0:}'.format('a'), u'a')

    # computed format specifiers
    self.assertEqual(u"{0:.{1}}".format(u'hello world', 5), u'hello')
    self.assertEqual(u"{0:.{1}s}".format(u'hello world', 5), u'hello')
    self.assertEqual(u"{0:.{precision}s}".format('hello world', precision=5), u'hello')
    self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width=10, precision=5),
                     u'hello' + u' ' * 5)
    self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width='10', precision='5'),
                     u'hello' + u' ' * 5)

    # test various errors
    self.assertRaises(ValueError, u'{'.format)
    self.assertRaises(ValueError, u'}'.format)
    self.assertRaises(ValueError, u'a{'.format)
    self.assertRaises(ValueError, u'a}'.format)
    self.assertRaises(ValueError, u'{a'.format)
    self.assertRaises(ValueError, u'}a'.format)
    self.assertRaises(IndexError, u'{0}'.format)
    self.assertRaises(IndexError, u'{1}'.format, u'abc')
    self.assertRaises(KeyError, u'{x}'.format)
    self.assertRaises(ValueError, u"}{".format)
    self.assertRaises(ValueError, u"{".format)
    self.assertRaises(ValueError, u"}".format)
    self.assertRaises(ValueError, u"abc{0:{}".format)
    self.assertRaises(ValueError, u"{0".format)
    self.assertRaises(IndexError, u"{0.}".format)
    self.assertRaises(ValueError, u"{0.}".format, 0)
    self.assertRaises(IndexError, u"{0[}".format)
    self.assertRaises(ValueError, u"{0[}".format, [])
    self.assertRaises(KeyError, u"{0]}".format)
    self.assertRaises(ValueError, u"{0.[]}".format, 0)
    self.assertRaises(ValueError, u"{0..foo}".format, 0)
    self.assertRaises(ValueError, u"{0[0}".format, 0)
    self.assertRaises(ValueError, u"{0[0:foo}".format, 0)
    self.assertRaises(KeyError, u"{c]}".format)
    self.assertRaises(ValueError, u"{{ {{{0}}".format, 0)
    self.assertRaises(ValueError, u"{0}}".format, 0)
    self.assertRaises(KeyError, u"{foo}".format, bar=3)
    self.assertRaises(ValueError, u"{0!x}".format, 3)
    self.assertRaises(ValueError, u"{0!}".format, 0)
    self.assertRaises(ValueError, u"{0!rs}".format, 0)
    self.assertRaises(ValueError, u"{!}".format)
    self.assertRaises(ValueError, u"{:}".format)
    self.assertRaises(ValueError, u"{:s}".format)
    self.assertRaises(ValueError, u"{}".format)
    big = u"23098475029384702983476098230754973209482573"
    self.assertRaises(ValueError, (u"{" + big + u"}").format)
    self.assertRaises(ValueError, (u"{[" + big + u"]}").format, [0])

    # issue 6089
    self.assertRaises(ValueError, u"{0[0]x}".format, [None])
    self.assertRaises(ValueError, u"{0[0](10)}".format, [None])

    # can't have a replacement on the field name portion
    self.assertRaises(TypeError, u'{0[{1}]}'.format, u'abcdefg', 4)

    # exceed maximum recursion depth
    self.assertRaises(ValueError, u"{0:{1:{2}}}".format, u'abc', u's', u'')
    self.assertRaises(ValueError, u"{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
                      0, 1, 2, 3, 4, 5, 6, 7)

    # string format spec errors
    self.assertRaises(ValueError, u"{0:-s}".format, u'')
    self.assertRaises(ValueError, format, u"", u"-")
    self.assertRaises(ValueError, u"{0:=s}".format, u'')

    # test combining string and unicode
    self.assertEqual(u"foo{0}".format('bar'), u'foobar')
    # This will try to convert the argument from unicode to str, which
    # will succeed
    self.assertEqual("foo{0}".format(u'bar'), 'foobar')
    # This will try to convert the argument from unicode to str, which
    # will fail
    self.assertRaises(UnicodeEncodeError, "foo{0}".format, u'\u1000bar')
1294
def test_raiseMemError(self):
    """A failed huge allocation must raise MemoryError, repeatedly."""
    # Ensure that the freelist contains a consistent object, even
    # when a string allocation fails with a MemoryError.
    # This used to crash the interpreter,
    # or leak references when the number was smaller.
    if sys.maxunicode >= 0x10000:
        charwidth = 4
    else:
        charwidth = 2

    def alloc():
        # Note: sys.maxsize is half of the actual max allocation because of
        # the signedness of Py_ssize_t.
        return u"a" * (sys.maxsize // charwidth * 2)

    # The allocation is attempted twice on purpose.
    for _attempt in (1, 2):
        self.assertRaises(MemoryError, alloc)
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +00001306
def test_format_subclass(self):
    """%-formatting and str.format() must honour an overridden __unicode__."""
    class U(unicode):
        def __unicode__(self):
            return u'__unicode__ overridden'
    u = U(u'xxx')
    # assertEqual rather than the deprecated assertEquals alias, in
    # line with every other test in this file
    self.assertEqual("%s" % u, u'__unicode__ overridden')
    self.assertEqual("{0}".format(u), u'__unicode__ overridden')
1314
1315
def test_main():
    """Hand this module's name to test_support.run_unittest()."""
    this_module = __name__
    test_support.run_unittest(this_module)
Barry Warsaw817918c2002-08-06 16:58:21 +00001318
# Run the tests when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()