blob: 399eed7e31f40656a15e5a0de0e9193bf3088f76 [file] [log] [blame]
Guido van Rossuma831cac2000-03-10 23:23:21 +00001""" Test script for the Unicode implementation.
2
Guido van Rossuma831cac2000-03-10 23:23:21 +00003Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
Marc-André Lemburg36619082001-01-17 19:11:13 +00007"""#"
Ezio Melotti12682b12011-08-22 23:46:30 +03008import sys
9import struct
10import codecs
11import unittest
Walter Dörwald0fd583c2003-02-21 12:53:50 +000012from test import test_support, string_tests
Guido van Rossuma831cac2000-03-10 23:23:21 +000013
# Decorator that skips a test when sys.maxunicode is 65535 (a narrow build).
requires_wide_build = unittest.skipUnless(sys.maxunicode != 65535,
                                          'requires wide build')
17
Neal Norwitz430f68b2005-11-24 22:00:56 +000018# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook serving two deliberately broken test codecs.

    "test.unicode1" yields an encoder/decoder pair that returns a bare int
    instead of a tuple; "test.unicode2" yields a pair that returns a tuple
    whose first item is not unicode.  Every other name yields None.
    """
    def _broken(result):
        # Build a codec function that ignores its arguments and returns
        # the given (invalid) result.
        def codec(input, errors="strict"):
            return result
        return codec

    if encoding == "test.unicode1":
        # encoder, decoder: not a tuple
        return (_broken(42), _broken(42), None, None)
    if encoding == "test.unicode2":
        # encoder, decoder: no unicode
        return (_broken((42, 42)), _broken((42, 42)), None, None)
    return None
codecs.register(search_function)
35
Walter Dörwald0fd583c2003-02-21 12:53:50 +000036class UnicodeTest(
37 string_tests.CommonTest,
Walter Dörwald57d88e52004-08-26 16:53:04 +000038 string_tests.MixinStrUnicodeUserStringTest,
39 string_tests.MixinStrUnicodeTest,
Walter Dörwald0fd583c2003-02-21 12:53:50 +000040 ):
41 type2test = unicode
42
def assertEqual(self, first, second, msg=None):
    """Strict assertEqual: reject implicit bytes/unicode equality.

    Runs the inherited equality check first.  Then, if either operand is
    a unicode object, both must be; otherwise, if either is a str, both
    must be.
    """
    super(UnicodeTest, self).assertEqual(first, second, msg)
    for string_type in (unicode, str):
        if isinstance(first, string_type) or isinstance(second, string_type):
            self.assertIsInstance(first, string_type)
            self.assertIsInstance(second, string_type)
            # only the first matching type is enforced
            break
52
def checkequalnofix(self, result, object, methodname, *args):
    """Check ``object.methodname(*args)`` against ``result``, value and type.

    The call result must compare equal to ``result`` and have exactly the
    same type.  If the method returned ``object`` itself, the call is
    repeated on a unicode-subclass copy of ``object`` and must then return
    an equal value that is a different object from the subclass instance.
    """
    method = getattr(object, methodname)
    realresult = method(*args)
    self.assertEqual(realresult, result)
    # assertIs reports both types on failure, unlike assertTrue(a is b)
    self.assertIs(type(realresult), type(result))

    # if the original is returned make sure that
    # this doesn't happen with subclasses
    if realresult is object:
        class usub(unicode):
            def __repr__(self):
                return 'usub(%r)' % unicode.__repr__(self)
        object = usub(object)
        method = getattr(object, methodname)
        realresult = method(*args)
        self.assertEqual(realresult, result)
        self.assertIsNot(object, realresult)
Guido van Rossume4874ae2001-09-21 15:36:41 +000070
def test_literals(self):
    # Different escape spellings of the same value give equal literals.
    self.assertEqual(u'\xff', u'\u00ff')
    self.assertEqual(u'\uffff', u'\U0000ffff')
    # Each of these \U escapes must be rejected with a SyntaxError.
    rejected = (
        'u\'\\Ufffffffe\'',
        'u\'\\Uffffffff\'',
        'u\'\\U%08x\'' % 0x110000,
    )
    for source in rejected:
        self.assertRaises(SyntaxError, eval, source)
Jeremy Hylton504de6b2003-10-06 05:08:26 +000077
def test_repr(self):
    # Checks the exact text produced by repr() for unicode objects.
    # Nothing is checked when sys.platform starts with 'java'.
    if not sys.platform.startswith('java'):
        # Test basic sanity of repr()
        self.assertEqual(repr(u'abc'), "u'abc'")
        self.assertEqual(repr(u'ab\\c'), "u'ab\\\\c'")
        self.assertEqual(repr(u'ab\\'), "u'ab\\\\'")
        self.assertEqual(repr(u'\\c'), "u'\\\\c'")
        self.assertEqual(repr(u'\\'), "u'\\\\'")
        self.assertEqual(repr(u'\n'), "u'\\n'")
        self.assertEqual(repr(u'\r'), "u'\\r'")
        self.assertEqual(repr(u'\t'), "u'\\t'")
        self.assertEqual(repr(u'\b'), "u'\\x08'")
        # Quote selection: the expected text switches to double quotes
        # only for the string holding just a single quote.
        self.assertEqual(repr(u"'\""), """u'\\'"'""")
        self.assertEqual(repr(u"'\""), """u'\\'"'""")
        self.assertEqual(repr(u"'"), '''u"'"''')
        self.assertEqual(repr(u'"'), """u'"'""")
        # Expected repr of the 256 characters U+0000..U+00FF joined in order.
        latin1repr = (
            "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
            "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
            "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
            "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
            "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
            "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
            "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
            "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
            "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
            "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
            "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
            "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
            "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
            "\\xfe\\xff'")
        testrepr = repr(u''.join(map(unichr, xrange(256))))
        self.assertEqual(testrepr, latin1repr)
        # Test repr works on wide unicode escapes without overflow.
        # Both sides are the same expression, so this only verifies that
        # repr() completes on the long string.
        # NOTE(review): nesting under the platform check is reconstructed
        # from a whitespace-collapsed source -- confirm.
        self.assertEqual(repr(u"\U00010000" * 39 + u"\uffff" * 4096),
                         repr(u"\U00010000" * 39 + u"\uffff" * 4096))
114
Walter Dörwald28256f22003-01-19 16:59:20 +0000115
def test_count(self):
    string_tests.CommonTest.test_count(self)
    # check mixed argument types
    # each row: expected count, string to search, arguments to count()
    mixed_cases = (
        (3, 'aaa', (u'a',)),
        (0, 'aaa', (u'b',)),
        (3, u'aaa', ('a',)),
        (0, u'aaa', ('b',)),
        (0, u'aaa', ('b',)),
        (1, u'aaa', ('a', -1)),
        (3, u'aaa', ('a', -10)),
        (2, u'aaa', ('a', 0, -1)),
        (0, u'aaa', ('a', 0, -10)),
    )
    for expected, subject, call_args in mixed_cases:
        self.checkequalnofix(expected, subject, 'count', *call_args)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000128
def test_find(self):
    # NOTE(review): unlike test_count/test_rfind/test_index, this override
    # does not call string_tests.CommonTest.test_find -- confirm intended.
    haystack = u'abcdefghiabc'
    # each row: expected index, arguments to find()
    lookups = (
        (0, (u'abc',)),
        (9, (u'abc', 1)),
        (-1, (u'def', 4)),
    )
    for expected, call_args in lookups:
        self.checkequalnofix(expected, haystack, 'find', *call_args)

    # a missing argument and an int argument must both raise TypeError
    find = u'hello'.find
    self.assertRaises(TypeError, find)
    self.assertRaises(TypeError, find, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000136
def test_rfind(self):
    string_tests.CommonTest.test_rfind(self)
    # check mixed argument types
    for expected, subject, needle in (
            (9, 'abcdefghiabc', u'abc'),
            (12, 'abcdefghiabc', u''),
            (12, u'abcdefghiabc', '')):
        self.checkequalnofix(expected, subject, 'rfind', needle)
Guido van Rossum8b264542000-12-19 02:22:31 +0000143
def test_index(self):
    string_tests.CommonTest.test_index(self)
    # check mixed argument types
    # rows: expected index, subject text, needle text, extra index() args
    hits = (
        (0, 'abcdefghiabc', '', ()),
        (3, 'abcdefghiabc', 'def', ()),
        (0, 'abcdefghiabc', 'abc', ()),
        (9, 'abcdefghiabc', 'abc', (1,)),
    )
    # rows: subject text, needle text, extra index() args -> ValueError
    misses = (
        ('abcdefghiabc', 'hib', ()),
        ('abcdefghiab', 'abc', (1,)),
        ('abcdefghi', 'ghi', (8,)),
        ('abcdefghi', 'ghi', (-1,)),
    )
    for subject_type, needle_type in ((str, unicode), (unicode, str)):
        for expected, subject, needle, bounds in hits:
            self.checkequalnofix(expected, subject_type(subject), 'index',
                                 needle_type(needle), *bounds)
        for subject, needle, bounds in misses:
            self.assertRaises(ValueError, subject_type(subject).index,
                              needle_type(needle), *bounds)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000156
def test_rindex(self):
    string_tests.CommonTest.test_rindex(self)
    # check mixed argument types
    # rows: expected index, subject text, needle text, extra rindex() args
    hits = (
        (12, 'abcdefghiabc', '', ()),
        (3, 'abcdefghiabc', 'def', ()),
        (9, 'abcdefghiabc', 'abc', ()),
        (0, 'abcdefghiabc', 'abc', (0, -1)),
    )
    # rows: subject text, needle text, extra rindex() args -> ValueError
    misses = (
        ('abcdefghiabc', 'hib', ()),
        ('defghiabc', 'def', (1,)),
        ('defghiabc', 'abc', (0, -1)),
        ('abcdefghi', 'ghi', (0, 8)),
        ('abcdefghi', 'ghi', (0, -1)),
    )
    for subject_type, needle_type in ((str, unicode), (unicode, str)):
        for expected, subject, needle, bounds in hits:
            self.checkequalnofix(expected, subject_type(subject), 'rindex',
                                 needle_type(needle), *bounds)

        for subject, needle, bounds in misses:
            self.assertRaises(ValueError, subject_type(subject).rindex,
                              needle_type(needle), *bounds)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000171
def test_translate(self):
    a, b, c, z = ord('a'), ord('b'), ord('c'), ord('z')
    # rows: expected result, input text, translation table
    # (table values used here: None, an ordinal, or a unicode string)
    tables = (
        (u'bbbc', u'abababc', {a: None}),
        (u'iiic', u'abababc', {a: None, b: ord('i')}),
        (u'iiix', u'abababc', {a: None, b: ord('i'), c: u'x'}),
        (u'<i><i><i>c', u'abababc', {a: None, b: u'<i>'}),
        (u'c', u'abababc', {a: None, b: u''}),
        (u'xyyx', u'xzx', {z: u'yy'}),
    )
    for expected, text, table in tables:
        self.checkequalnofix(expected, text, 'translate', table)

    # no table at all, and a table mapping to a str, must raise TypeError
    self.assertRaises(TypeError, u'hello'.translate)
    self.assertRaises(TypeError, u'abababc'.translate, {a: ''})
Guido van Rossuma831cac2000-03-10 23:23:21 +0000182
def test_split(self):
    string_tests.CommonTest.test_split(self)

    # Mixed arguments
    # rows: expected unicode pieces, text to split, separator
    for expected, text, separator in (
            ([u'a', u'b', u'c', u'd'], u'a//b//c//d', '//'),
            ([u'a', u'b', u'c', u'd'], 'a//b//c//d', u'//'),
            ([u'endcase ', u''], u'endcase test', 'test')):
        self.checkequalnofix(expected, text, 'split', separator)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000190
def test_join(self):
    string_tests.MixinStrUnicodeUserStringTest.test_join(self)

    # mixed arguments
    # rows: expected unicode result, separator, items to join
    for expected, separator, items in (
            (u'a b c d', u' ', ['a', 'b', u'c', u'd']),
            (u'abcd', u'', (u'a', u'b', u'c', u'd')),
            (u'w x y z', u' ', string_tests.Sequence('wxyz')),
            (u'a b c d', ' ', [u'a', u'b', u'c', u'd']),
            (u'a b c d', ' ', ['a', 'b', u'c', u'd']),
            (u'abcd', '', (u'a', u'b', u'c', u'd')),
            (u'w x y z', ' ', string_tests.Sequence(u'wxyz'))):
        self.checkequalnofix(expected, separator, 'join', items)
Marc-André Lemburge5034372000-08-08 08:04:29 +0000202
def test_strip(self):
    string_tests.CommonTest.test_strip(self)
    # strip() given the byte string "\xff" must raise UnicodeError
    strip_chars = "\xff"
    self.assertRaises(UnicodeError, u"hello".strip, strip_chars)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000206
def test_replace(self):
    string_tests.CommonTest.test_replace(self)

    # method call forwarded from str implementation because of unicode argument
    byte_text = 'one!two!three!'
    self.checkequalnofix(u'one@two!three!', byte_text, 'replace',
                         u'!', u'@', 1)
    # a non-string replacement value must raise TypeError
    str_replace = 'replace'.replace
    self.assertRaises(TypeError, str_replace, u"r", 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000213
def test_comparison(self):
    # Comparisons:
    # equality and ordering for each mix of unicode and str operands
    for left, right in ((u'abc', 'abc'), ('abc', u'abc'), (u'abc', u'abc')):
        self.assertTrue(left == right)
    for longer, shorter in ((u'abcd', 'abc'), ('abcd', u'abc'),
                            (u'abcd', u'abc')):
        self.assertTrue(longer > shorter)
    for shorter, longer in ((u'abc', 'abcd'), ('abc', u'abcd'),
                            (u'abc', u'abcd')):
        self.assertTrue(shorter < longer)

    if 0:
        # Disabled on purpose; everything below never runs.
        # Move these tests to a Unicode collation module test...
        # Testing UTF-16 code point order comparisons...

        # No surrogates, no fixup required.
        self.assertTrue(u'\u0061' < u'\u20ac')
        # Non surrogate below surrogate value, no fixup required
        self.assertTrue(u'\u0061' < u'\ud800\udc02')

        # Non surrogate above surrogate value, fixup required
        def test_lecmp(s, s2):
            self.assertTrue(s < s2)

        def test_fixup(s):
            surrogate_pairs = (
                u'\ud800\udc01', u'\ud900\udc01',
                u'\uda00\udc01', u'\udb00\udc01',
                u'\ud800\udd01', u'\ud900\udd01',
                u'\uda00\udd01', u'\udb00\udd01',
                u'\ud800\ude01', u'\ud900\ude01',
                u'\uda00\ude01', u'\udb00\ude01',
                u'\ud800\udfff', u'\ud900\udfff',
                u'\uda00\udfff', u'\udb00\udfff',
            )
            for s2 in surrogate_pairs:
                test_lecmp(s, s2)

        for above_surrogates in (u'\ue000', u'\uff61'):
            test_fixup(above_surrogates)

        # Surrogates on both sides, no fixup required
        self.assertTrue(u'\ud800\udc02' < u'\ud84d\udc56')
Walter Dörwald28256f22003-01-19 16:59:20 +0000278
def test_capitalize(self):
    string_tests.CommonTest.test_capitalize(self)
    # rows: expected capitalize() result, input text
    cases = (
        # check that titlecased chars are lowered correctly
        # \u1ffc is the titlecased char
        (u'\u1ffc\u1ff3\u1ff3\u1ff3',
         u'\u1ff3\u1ff3\u1ffc\u1ffc'),
        # check with cased non-letter chars
        (u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
         u'\u24c5\u24ce\u24c9\u24bd\u24c4\u24c3'),
        (u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
         u'\u24df\u24e8\u24e3\u24d7\u24de\u24dd'),
        (u'\u2160\u2171\u2172',
         u'\u2160\u2161\u2162'),
        (u'\u2160\u2171\u2172',
         u'\u2170\u2171\u2172'),
        # check with Ll chars with no upper - nothing changes here
        (u'\u019b\u1d00\u1d86\u0221\u1fb7',
         u'\u019b\u1d00\u1d86\u0221\u1fb7'),
    )
    for expected, text in cases:
        self.checkequal(expected, text, 'capitalize')
297
def test_islower(self):
    string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
    # u'\u1FFc' must not count as lowercase
    probe = u'\u1FFc'
    self.checkequalnofix(False, probe, 'islower')
Walter Dörwald28256f22003-01-19 16:59:20 +0000301
@requires_wide_build
def test_islower_non_bmp(self):
    # non-BMP, uppercase
    for ch in (u'\U00010401', u'\U00010427'):
        self.assertFalse(ch.islower())
    # non-BMP, lowercase
    for ch in (u'\U00010429', u'\U0001044E'):
        self.assertTrue(ch.islower())
    # non-BMP, non-cased
    for ch in (u'\U0001F40D', u'\U0001F46F'):
        self.assertFalse(ch.islower())
313
def test_isupper(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
    # the extra check is skipped when sys.platform starts with 'java'
    if sys.platform.startswith('java'):
        return
    self.checkequalnofix(False, u'\u1FFc', 'isupper')
Walter Dörwald28256f22003-01-19 16:59:20 +0000318
@requires_wide_build
def test_isupper_non_bmp(self):
    # non-BMP, uppercase
    for ch in (u'\U00010401', u'\U00010427'):
        self.assertTrue(ch.isupper())
    # non-BMP, lowercase
    for ch in (u'\U00010429', u'\U0001044E'):
        self.assertFalse(ch.isupper())
    # non-BMP, non-cased
    for ch in (u'\U0001F40D', u'\U0001F46F'):
        self.assertFalse(ch.isupper())
330
def test_istitle(self):
    string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
    # both strings must be reported as titlecased
    for titled in (u'\u1FFc', u'Greek \u1FFcitlecases ...'):
        self.checkequalnofix(True, titled, 'istitle')
Walter Dörwald28256f22003-01-19 16:59:20 +0000335
@requires_wide_build
def test_istitle_non_bmp(self):
    # non-BMP, uppercase + lowercase
    for pair in (u'\U00010401\U00010429', u'\U00010427\U0001044E'):
        self.assertTrue(pair.istitle())
    # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
    not_titled = (u'\U00010429', u'\U0001044E', u'\U0001F40D', u'\U0001F46F')
    for ch in not_titled:
        self.assertFalse(ch.istitle(), '{!r} is not title'.format(ch))
344
def test_isspace(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
    # rows: expected isspace() result, character
    for expected, ch in ((True, u'\u2000'),
                         (True, u'\u200a'),
                         (False, u'\u2014')):
        self.checkequalnofix(expected, ch, 'isspace')
Walter Dörwald28256f22003-01-19 16:59:20 +0000350
@requires_wide_build
def test_isspace_non_bmp(self):
    # apparently there are no non-BMP spaces chars in Unicode 6
    non_spaces = (
        u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
        u'\U0001F40D', u'\U0001F46F',
    )
    for ch in non_spaces:
        self.assertFalse(ch.isspace(), '{!r} is not space.'.format(ch))
357
@requires_wide_build
def test_isalnum_non_bmp(self):
    # every listed non-BMP character must be alphanumeric
    alnum_chars = (
        u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
        u'\U0001D7F6', u'\U000104A0', u'\U000104A0', u'\U0001F107',
    )
    for ch in alnum_chars:
        self.assertTrue(ch.isalnum(), '{!r} is alnum.'.format(ch))
363
def test_isalpha(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
    # u'\u1FFc' must count as alphabetic
    probe = u'\u1FFc'
    self.checkequalnofix(True, probe, 'isalpha')
Walter Dörwald28256f22003-01-19 16:59:20 +0000367
@requires_wide_build
def test_isalpha_non_bmp(self):
    # non-BMP, cased
    for ch in (u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E'):
        self.assertTrue(ch.isalpha())
    # non-BMP, non-cased
    for ch in (u'\U0001F40D', u'\U0001F46F'):
        self.assertFalse(ch.isalpha())
378
def test_isdecimal(self):
    # rows: expected isdecimal() result, text
    expectations = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (False, u'\u2460'),  # CIRCLED DIGIT ONE
        (False, u'\xbc'),  # VULGAR FRACTION ONE QUARTER
        (True, u'\u0660'),  # ARABIC-INDIC DIGIT ZERO
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, text in expectations:
        self.checkequalnofix(expected, text, 'isdecimal')

    # isdecimal() takes no argument
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)
Walter Dörwald28256f22003-01-19 16:59:20 +0000390
@requires_wide_build
def test_isdecimal_non_bmp(self):
    non_decimals = (
        u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
        u'\U0001F40D', u'\U0001F46F', u'\U00011065', u'\U0001F107',
    )
    decimals = (u'\U0001D7F6', u'\U000104A0', u'\U000104A0')
    for ch in non_decimals:
        self.assertFalse(ch.isdecimal(), '{!r} is not decimal.'.format(ch))
    for ch in decimals:
        self.assertTrue(ch.isdecimal(), '{!r} is decimal.'.format(ch))
398
def test_isdigit(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
    # rows: expected isdigit() result, character
    for expected, ch in ((True, u'\u2460'),
                         (False, u'\xbc'),
                         (True, u'\u0660')):
        self.checkequalnofix(expected, ch, 'isdigit')
Walter Dörwald28256f22003-01-19 16:59:20 +0000404
@requires_wide_build
def test_isdigit_non_bmp(self):
    non_digits = (
        u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
        u'\U0001F40D', u'\U0001F46F', u'\U00011065',
    )
    digits = (u'\U0001D7F6', u'\U000104A0', u'\U000104A0', u'\U0001F107')
    for ch in non_digits:
        self.assertFalse(ch.isdigit(), '{!r} is not a digit.'.format(ch))
    for ch in digits:
        self.assertTrue(ch.isdigit(), '{!r} is a digit.'.format(ch))
412
def test_isnumeric(self):
    # rows: expected isnumeric() result, text
    expectations = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (True, u'\u2460'),
        (True, u'\xbc'),
        (True, u'\u0660'),
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, text in expectations:
        self.checkequalnofix(expected, text, 'isnumeric')

    # isnumeric() takes no argument
    self.assertRaises(TypeError, u"abc".isnumeric, 42)
424
@requires_wide_build
def test_isnumeric_non_bmp(self):
    non_numerics = (
        u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
        u'\U0001F40D', u'\U0001F46F',
    )
    numerics = (
        u'\U00010107', u'\U0001D7F6', u'\U00023b1b',
        u'\U000104A0', u'\U0001F107',
    )
    for ch in non_numerics:
        self.assertFalse(ch.isnumeric(), '{!r} is not numeric.'.format(ch))
    for ch in numerics:
        self.assertTrue(ch.isnumeric(), '{!r} is numeric.'.format(ch))
433
@requires_wide_build
def test_surrogates(self):
    # this test actually passes on narrow too, but it's just by accident.
    # Surrogates are seen as non-cased chars, so u'X\uD800X' is as
    # uppercase as 'X X'
    lower_mixed = (u'a\uD800b\uDFFF', u'a\uDFFFb\uD800',
                   u'a\uD800b\uDFFFa', u'a\uDFFFb\uD800a')
    upper_mixed = (u'A\uD800B\uDFFF', u'A\uDFFFB\uD800',
                   u'A\uD800B\uDFFFA', u'A\uDFFFB\uD800A')
    lone_surrogates = (u'\uD800', u'\uDFFF', u'\uD800\uD800', u'\uDFFF\uDFFF')

    for s in lower_mixed:
        self.assertTrue(s.islower())
        self.assertFalse(s.isupper())
        self.assertFalse(s.istitle())
    for s in upper_mixed:
        self.assertFalse(s.islower())
        self.assertTrue(s.isupper())
        self.assertTrue(s.istitle())

    # strings made only of surrogates satisfy no case predicate
    for meth_name in ('islower', 'isupper', 'istitle'):
        predicate = getattr(unicode, meth_name)
        for s in lone_surrogates:
            self.assertFalse(predicate(s),
                             '%r.%s() is False' % (s, meth_name))

    # any string containing a surrogate fails the character-class predicates
    for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
                      'isdecimal', 'isnumeric'):
        predicate = getattr(unicode, meth_name)
        for s in lone_surrogates + lower_mixed:
            self.assertFalse(predicate(s),
                             '%r.%s() is False' % (s, meth_name))
462
463
def test_lower(self):
    # The shared checks do not need a wide build, so run them everywhere.
    string_tests.CommonTest.test_lower(self)

@requires_wide_build
def test_lower_non_bmp(self):
    # lower() on characters outside the BMP; only these need a wide build.
    self.assertEqual(u'\U00010427'.lower(), u'\U0001044F')
    self.assertEqual(u'\U00010427\U00010427'.lower(),
                     u'\U0001044F\U0001044F')
    self.assertEqual(u'\U00010427\U0001044F'.lower(),
                     u'\U0001044F\U0001044F')
    self.assertEqual(u'X\U00010427x\U0001044F'.lower(),
                     u'x\U0001044Fx\U0001044F')
474
def test_upper(self):
    # The shared checks do not need a wide build, so run them everywhere.
    string_tests.CommonTest.test_upper(self)

@requires_wide_build
def test_upper_non_bmp(self):
    # upper() on characters outside the BMP; only these need a wide build.
    self.assertEqual(u'\U0001044F'.upper(), u'\U00010427')
    self.assertEqual(u'\U0001044F\U0001044F'.upper(),
                     u'\U00010427\U00010427')
    self.assertEqual(u'\U00010427\U0001044F'.upper(),
                     u'\U00010427\U00010427')
    self.assertEqual(u'X\U00010427x\U0001044F'.upper(),
                     u'X\U00010427X\U00010427')
485
@requires_wide_build
def test_capitalize_non_bmp(self):
    # capitalize() on characters outside the BMP.
    # Named *_non_bmp so it no longer replaces the class's other
    # test_capitalize method, which already runs the shared
    # CommonTest.test_capitalize checks.
    self.assertEqual(u'\U0001044F'.capitalize(), u'\U00010427')
    self.assertEqual(u'\U0001044F\U0001044F'.capitalize(),
                     u'\U00010427\U0001044F')
    self.assertEqual(u'\U00010427\U0001044F'.capitalize(),
                     u'\U00010427\U0001044F')
    self.assertEqual(u'\U0001044F\U00010427'.capitalize(),
                     u'\U00010427\U0001044F')
    self.assertEqual(u'X\U00010427x\U0001044F'.capitalize(),
                     u'X\U0001044Fx\U0001044F')
498
def test_title(self):
    # The shared checks do not need a wide build, so run them everywhere.
    string_tests.MixinStrUnicodeUserStringTest.test_title(self)

@requires_wide_build
def test_title_non_bmp(self):
    # title() on characters outside the BMP; only these need a wide build.
    self.assertEqual(u'\U0001044F'.title(), u'\U00010427')
    self.assertEqual(u'\U0001044F\U0001044F'.title(),
                     u'\U00010427\U0001044F')
    self.assertEqual(u'\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
                     u'\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual(u'\U00010427\U0001044F \U00010427\U0001044F'.title(),
                     u'\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual(u'\U0001044F\U00010427 \U0001044F\U00010427'.title(),
                     u'\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual(u'X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
                     u'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
513
def test_swapcase(self):
    # The shared checks do not need a wide build, so run them everywhere.
    string_tests.CommonTest.test_swapcase(self)

@requires_wide_build
def test_swapcase_non_bmp(self):
    # swapcase() on characters outside the BMP; only these need a wide build.
    self.assertEqual(u'\U0001044F'.swapcase(), u'\U00010427')
    self.assertEqual(u'\U00010427'.swapcase(), u'\U0001044F')
    self.assertEqual(u'\U0001044F\U0001044F'.swapcase(),
                     u'\U00010427\U00010427')
    self.assertEqual(u'\U00010427\U0001044F'.swapcase(),
                     u'\U0001044F\U00010427')
    self.assertEqual(u'\U0001044F\U00010427'.swapcase(),
                     u'\U00010427\U0001044F')
    self.assertEqual(u'X\U00010427x\U0001044F'.swapcase(),
                     u'x\U0001044FX\U00010427')
527
def test_contains(self):
    # Testing Unicode contains method
    # Exercises the `in` operator for every mix of str and unicode
    # operands, and for unicode/str items looked up in tuples.
    self.assertIn('a', u'abdb')
    self.assertIn('a', u'bdab')
    self.assertIn('a', u'bdaba')
    self.assertIn('a', u'bdba')
    self.assertIn('a', u'bdba')
    self.assertIn(u'a', u'bdba')
    self.assertNotIn(u'a', u'bdb')
    self.assertNotIn(u'a', 'bdb')
    self.assertIn(u'a', 'bdba')
    # membership in tuples holding str, unicode and non-string items
    self.assertIn(u'a', ('a',1,None))
    self.assertIn(u'a', (1,None,'a'))
    self.assertIn(u'a', (1,None,u'a'))
    self.assertIn('a', ('a',1,None))
    self.assertIn('a', (1,None,'a'))
    self.assertIn('a', (1,None,u'a'))
    self.assertNotIn('a', ('x',1,u'y'))
    self.assertNotIn('a', ('x',1,None))
    # multi-character and empty substrings
    self.assertNotIn(u'abcd', u'abcxxxx')
    self.assertIn(u'ab', u'abcd')
    self.assertIn('ab', u'abc')
    self.assertIn(u'ab', 'abc')
    self.assertIn(u'ab', (1,None,u'ab'))
    self.assertIn(u'', u'abc')
    self.assertIn('', u'abc')

    # If the following fails either
    # the contains operator does not propagate UnicodeErrors or
    # someone has changed the default encoding
    self.assertRaises(UnicodeDecodeError, 'g\xe2teau'.__contains__, u'\xe2')
    self.assertRaises(UnicodeDecodeError, u'g\xe2teau'.__contains__, '\xe2')

    # empty strings, NUL characters and whole-string matches, each tried
    # as unicode-in-str, str-in-unicode and unicode-in-unicode
    self.assertIn(u'', '')
    self.assertIn('', u'')
    self.assertIn(u'', u'')
    self.assertIn(u'', 'abc')
    self.assertIn('', u'abc')
    self.assertIn(u'', u'abc')
    self.assertNotIn(u'\0', 'abc')
    self.assertNotIn('\0', u'abc')
    self.assertNotIn(u'\0', u'abc')
    self.assertIn(u'\0', '\0abc')
    self.assertIn('\0', u'\0abc')
    self.assertIn(u'\0', u'\0abc')
    self.assertIn(u'\0', 'abc\0')
    self.assertIn('\0', u'abc\0')
    self.assertIn(u'\0', u'abc\0')
    self.assertIn(u'a', '\0abc')
    self.assertIn('a', u'\0abc')
    self.assertIn(u'a', u'\0abc')
    self.assertIn(u'asdf', 'asdf')
    self.assertIn('asdf', u'asdf')
    self.assertIn(u'asdf', u'asdf')
    self.assertNotIn(u'asdf', 'asd')
    self.assertNotIn('asdf', u'asd')
    self.assertNotIn(u'asdf', u'asd')
    self.assertNotIn(u'asdf', '')
    self.assertNotIn('asdf', u'')
    self.assertNotIn(u'asdf', u'')

    # __contains__ needs exactly one argument, and it must be a string
    self.assertRaises(TypeError, u"abc".__contains__)
    self.assertRaises(TypeError, u"abc".__contains__, object())
Walter Dörwald28256f22003-01-19 16:59:20 +0000591
def test_formatting(self):
    # Checks %-formatting with unicode format strings, and str format
    # strings whose result becomes unicode because of a unicode argument.
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
    # %5.2f pads to a minimum width of 5, so a four-character value such
    # as "3.00" gets one pad space in addition to the separator space.
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57')
    if not sys.platform.startswith('java'):
        self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
    self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
    self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')

    self.assertEqual(u'%c' % 0x1234, u'\u1234')
    self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))
    self.assertRaises(ValueError, u"%.1\u1032f".__mod__, (1.0/3))

    # %c accepts both a one-character str and its ordinal for 0x00-0x7f
    for num in range(0x00,0x80):
        char = chr(num)
        self.assertEqual(u"%c" % char, unicode(char))
        self.assertEqual(u"%c" % num, unicode(char))
        self.assertTrue(char == u"%c" % char)
        self.assertTrue(char == u"%c" % num)
    # Issue 7649
    for num in range(0x80,0x100):
        uchar = unichr(num)
        self.assertEqual(uchar, u"%c" % num)   # works only with ints
        self.assertEqual(uchar, u"%c" % uchar) # and unicode chars
        # the implicit decoding should fail for non-ascii chars
        self.assertRaises(UnicodeDecodeError, u"%c".__mod__, chr(num))
        self.assertRaises(UnicodeDecodeError, u"%s".__mod__, chr(num))

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123},  u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % u"abc", u'...abc...')
    # '*' takes the field width (and precision) from the argument tuple;
    # the expected strings are padded out to that width of 5.
    self.assertEqual('%*s' % (5,u'abc',), u'  abc')
    self.assertEqual('%*s' % (-5,u'abc',), u'abc  ')
    self.assertEqual('%*.*s' % (5,2,u'abc',), u'   ab')
    self.assertEqual('%*.*s' % (5,3,u'abc',), u'  abc')
    self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103   abc')
    self.assertEqual('%c' % u'a', u'a')
    # an object whose __str__ returns unicode makes the '%s' result unicode
    class Wrapper:
        def __str__(self):
            return u'\u1234'
    self.assertEqual('%s' % Wrapper(), u'\u1234')
Tim Peters4511a712006-05-03 04:46:14 +0000646
def test_startswith_endswith_errors(self):
    # Both methods must refuse a byte string that cannot be implicitly
    # decoded, and must mention every accepted argument type in the
    # TypeError raised for an argument that is neither string nor tuple.
    expected_type_names = ('unicode', 'str', 'tuple')
    for method in (u'foo'.startswith, u'foo'.endswith):
        self.assertRaises(UnicodeDecodeError, method, '\xff')
        with self.assertRaises(TypeError) as caught:
            method(['f'])
        message = str(caught.exception)
        for type_name in expected_type_names:
            self.assertIn(type_name, message)
656 self.assertIn('tuple', exc)
657
@test_support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    # Even with a de_DE / fr_FR locale active the result must use the
    # C-locale decimal point, never a comma.
    formatted = u'%.1f' % 1.0
    self.assertEqual(u'1.0', formatted)
Georg Brandlda6b1072006-01-20 17:48:54 +0000662
Walter Dörwald28256f22003-01-19 16:59:20 +0000663 def test_constructor(self):
664 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
665
666 self.assertEqual(
667 unicode(u'unicode remains unicode'),
668 u'unicode remains unicode'
669 )
670
671 class UnicodeSubclass(unicode):
Marc-André Lemburg79f57832002-12-29 19:44:06 +0000672 pass
Guido van Rossuma831cac2000-03-10 23:23:21 +0000673
Walter Dörwald28256f22003-01-19 16:59:20 +0000674 self.assertEqual(
675 unicode(UnicodeSubclass('unicode subclass becomes unicode')),
676 u'unicode subclass becomes unicode'
677 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000678
Walter Dörwald28256f22003-01-19 16:59:20 +0000679 self.assertEqual(
680 unicode('strings are converted to unicode'),
681 u'strings are converted to unicode'
682 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000683
Walter Dörwald28256f22003-01-19 16:59:20 +0000684 class UnicodeCompat:
685 def __init__(self, x):
686 self.x = x
687 def __unicode__(self):
688 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000689
Walter Dörwald28256f22003-01-19 16:59:20 +0000690 self.assertEqual(
691 unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
692 u'__unicode__ compatible objects are recognized')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000693
Walter Dörwald28256f22003-01-19 16:59:20 +0000694 class StringCompat:
695 def __init__(self, x):
696 self.x = x
697 def __str__(self):
698 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000699
Walter Dörwald28256f22003-01-19 16:59:20 +0000700 self.assertEqual(
701 unicode(StringCompat('__str__ compatible objects are recognized')),
702 u'__str__ compatible objects are recognized'
703 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000704
Walter Dörwald28256f22003-01-19 16:59:20 +0000705 # unicode(obj) is compatible to str():
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000706
Walter Dörwald28256f22003-01-19 16:59:20 +0000707 o = StringCompat('unicode(obj) is compatible to str()')
708 self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
709 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000710
Marc-André Lemburgd25c6502004-07-23 16:13:25 +0000711 # %-formatting and .__unicode__()
712 self.assertEqual(u'%s' %
713 UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"),
714 u"u'%s' % obj uses obj.__unicode__()")
715 self.assertEqual(u'%s' %
716 UnicodeCompat(u"u'%s' % obj falls back to obj.__str__()"),
717 u"u'%s' % obj falls back to obj.__str__()")
718
Walter Dörwald28256f22003-01-19 16:59:20 +0000719 for obj in (123, 123.45, 123L):
720 self.assertEqual(unicode(obj), unicode(str(obj)))
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000721
Walter Dörwald28256f22003-01-19 16:59:20 +0000722 # unicode(obj, encoding, error) tests (this maps to
723 # PyUnicode_FromEncodedObject() at C level)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000724
Walter Dörwald28256f22003-01-19 16:59:20 +0000725 if not sys.platform.startswith('java'):
726 self.assertRaises(
727 TypeError,
728 unicode,
729 u'decoding unicode is not supported',
730 'utf-8',
731 'strict'
732 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000733
Walter Dörwald28256f22003-01-19 16:59:20 +0000734 self.assertEqual(
735 unicode('strings are decoded to unicode', 'utf-8', 'strict'),
736 u'strings are decoded to unicode'
737 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000738
Walter Dörwald28256f22003-01-19 16:59:20 +0000739 if not sys.platform.startswith('java'):
Florent Xicluna6de9e932010-03-07 12:18:33 +0000740 with test_support.check_py3k_warnings():
Antoine Pitrou5b7139a2010-01-02 21:12:58 +0000741 buf = buffer('character buffers are decoded to unicode')
Walter Dörwald28256f22003-01-19 16:59:20 +0000742 self.assertEqual(
743 unicode(
Antoine Pitrou5b7139a2010-01-02 21:12:58 +0000744 buf,
Walter Dörwald28256f22003-01-19 16:59:20 +0000745 'utf-8',
746 'strict'
747 ),
748 u'character buffers are decoded to unicode'
749 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000750
Walter Dörwald28256f22003-01-19 16:59:20 +0000751 self.assertRaises(TypeError, unicode, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000752
Walter Dörwald28256f22003-01-19 16:59:20 +0000753 def test_codecs_utf7(self):
754 utfTests = [
755 (u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
756 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
757 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
758 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
759 (u'+', '+-'),
760 (u'+-', '+--'),
761 (u'+?', '+-?'),
762 (u'\?', '+AFw?'),
763 (u'+?', '+-?'),
764 (ur'\\?', '+AFwAXA?'),
765 (ur'\\\?', '+AFwAXABc?'),
Antoine Pitrou653dece2009-05-04 18:32:32 +0000766 (ur'++--', '+-+---'),
767 (u'\U000abcde', '+2m/c3g-'), # surrogate pairs
768 (u'/', '/'),
Walter Dörwald28256f22003-01-19 16:59:20 +0000769 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000770
Walter Dörwald28256f22003-01-19 16:59:20 +0000771 for (x, y) in utfTests:
772 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000773
Antoine Pitrou653dece2009-05-04 18:32:32 +0000774 # Unpaired surrogates not supported
Walter Dörwald28256f22003-01-19 16:59:20 +0000775 self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000776
Antoine Pitrou653dece2009-05-04 18:32:32 +0000777 self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd\ufffd')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000778
Antoine Pitrou653dece2009-05-04 18:32:32 +0000779 # Direct encoded characters
780 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
781 # Optional direct characters
782 set_o = '!"#$%&*;<=>@[]^_`{|}'
783 for c in set_d:
784 self.assertEqual(c.encode('utf7'), c.encode('ascii'))
Florent Xiclunac0c0b142010-09-13 08:53:00 +0000785 self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c))
786 self.assertTrue(c == c.encode('ascii').decode('utf7'))
Antoine Pitrou653dece2009-05-04 18:32:32 +0000787 for c in set_o:
Florent Xiclunac0c0b142010-09-13 08:53:00 +0000788 self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c))
789 self.assertTrue(c == c.encode('ascii').decode('utf7'))
Antoine Pitrou4982d5d2008-07-25 17:45:59 +0000790
def test_codecs_utf8(self):
    # (unicode text, expected UTF-8 bytes) pairs for the encoder
    encode_cases = [
        (u'', ''),
        (u'\u20ac', '\xe2\x82\xac'),
        (u'\ud800\udc02', '\xf0\x90\x80\x82'),
        (u'\ud84d\udc56', '\xf0\xa3\x91\x96'),
        (u'\ud800', '\xed\xa0\x80'),
        (u'\udc00', '\xed\xb0\x80'),
        (u'\ud800\udc02'*1000, '\xf0\x90\x80\x82'*1000),
        (
            u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
            u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
            u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
            u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
            u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
            u' Nunstuck git und',
            '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
            '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
            '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
            '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
            '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
            '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
            '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
            '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
            '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
            '\xe3\x80\x8cWenn ist das Nunstuck git und'
        ),
    ]
    for text, encoded in encode_cases:
        self.assertEqual(text.encode('utf-8'), encoded)

    # UTF-8 specific decoding tests
    decode_cases = [
        ('\xf0\xa3\x91\x96', u'\U00023456'),
        ('\xf0\x90\x80\x82', u'\U00010002'),
        ('\xe2\x82\xac', u'\u20ac'),
    ]
    for encoded, text in decode_cases:
        self.assertEqual(unicode(encoded, 'utf-8'), text)

    # Other possible utf-8 test cases:
    # * strict decoding testing for all of the
    #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000829
def test_utf8_decode_valid_sequences(self):
    # Boundary values of every valid UTF-8 sequence length.
    sequences = [
        # single byte
        ('\x00', u'\x00'), ('a', u'a'), ('\x7f', u'\x7f'),
        # 2 bytes
        ('\xc2\x80', u'\x80'), ('\xdf\xbf', u'\u07ff'),
        # 3 bytes
        ('\xe0\xa0\x80', u'\u0800'), ('\xed\x9f\xbf', u'\ud7ff'),
        ('\xee\x80\x80', u'\uE000'), ('\xef\xbf\xbf', u'\uffff'),
        # 4 bytes
        ('\xF0\x90\x80\x80', u'\U00010000'),
        ('\xf4\x8f\xbf\xbf', u'\U0010FFFF')
    ]
    for seq, res in sequences:
        self.assertEqual(seq.decode('utf-8'), res)

    # Round-trip every code point this build supports.  The upper bound
    # is sys.maxunicode + 1 so the last code point is included, and
    # xrange avoids materialising the whole range as lists first.
    for code in xrange(sys.maxunicode + 1):
        ch = unichr(code)
        self.assertEqual(ch, ch.encode('utf-8').decode('utf-8'))
848
def test_utf8_decode_invalid_sequences(self):
    # continuation bytes in a sequence of 2, 3, or 4 bytes
    continuation_bytes = map(chr, range(0x80, 0xC0))
    # start bytes of a 2-byte sequence equivalent to codepoints <= 0x7F
    invalid_2B_seq_start_bytes = map(chr, range(0xC0, 0xC2))
    # start bytes of a 4-byte sequence equivalent to codepoints > 0x10FFFF
    invalid_4B_seq_start_bytes = map(chr, range(0xF5, 0xF8))
    # The remaining bytes start at 0xF8: 0xF7 is already part of
    # invalid_4B_seq_start_bytes and does not need to be tested twice.
    invalid_start_bytes = (
        continuation_bytes + invalid_2B_seq_start_bytes +
        invalid_4B_seq_start_bytes + map(chr, range(0xF8, 0x100))
    )

    # none of these bytes is decodable on its own
    for byte in invalid_start_bytes:
        self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')

    for sb in invalid_2B_seq_start_bytes:
        for cb in continuation_bytes:
            self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')

    for sb in invalid_4B_seq_start_bytes:
        for cb1 in continuation_bytes[:3]:
            for cb3 in continuation_bytes[:3]:
                self.assertRaises(UnicodeDecodeError,
                                  (sb+cb1+'\x80'+cb3).decode, 'utf-8')

    # 0xE0 followed by a second byte in 0x80-0x9F
    for cb in map(chr, range(0x80, 0xA0)):
        self.assertRaises(UnicodeDecodeError,
                          ('\xE0'+cb+'\x80').decode, 'utf-8')
        self.assertRaises(UnicodeDecodeError,
                          ('\xE0'+cb+'\xBF').decode, 'utf-8')

    # XXX: surrogates shouldn't be valid UTF-8!
    # see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
    # (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
    # A strict decoder would raise UnicodeDecodeError for '\xED' followed
    # by a byte in 0xA0-0xBF and then '\x80' or '\xBF',
    # but since they are valid on Python 2 add a test for that:
    for cb, surrogate in zip(map(chr, range(0xA0, 0xC0)),
                             map(unichr, range(0xd800, 0xe000, 64))):
        encoded = '\xED'+cb+'\x80'
        self.assertEqual(encoded.decode('utf-8'), surrogate)
        self.assertEqual(surrogate.encode('utf-8'), encoded)

    # 0xF0 followed by a second byte in 0x80-0x8F
    for cb in map(chr, range(0x80, 0x90)):
        self.assertRaises(UnicodeDecodeError,
                          ('\xF0'+cb+'\x80\x80').decode, 'utf-8')
        self.assertRaises(UnicodeDecodeError,
                          ('\xF0'+cb+'\xBF\xBF').decode, 'utf-8')
    # 0xF4 followed by a second byte in 0x90-0xBF
    for cb in map(chr, range(0x90, 0xC0)):
        self.assertRaises(UnicodeDecodeError,
                          ('\xF4'+cb+'\x80\x80').decode, 'utf-8')
        self.assertRaises(UnicodeDecodeError,
                          ('\xF4'+cb+'\xBF\xBF').decode, 'utf-8')
904
def test_issue8271(self):
    # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
    # only the start byte and the continuation byte(s) are now considered
    # invalid, instead of the number of bytes specified by the start byte.
    # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
    # table 3-8, Row 2) for more information about the algorithm used.
    #
    # Each entry pairs an undecodable byte string with the text expected
    # from the 'replace' error handler ("cb" = continuation byte).
    FFFD = u'\ufffd'
    sequences = [
        # invalid start bytes
        ('\x80', FFFD),                          # lone continuation byte
        ('\x80\x80', FFFD*2),                    # 2 continuation bytes
        ('\xc0', FFFD),
        ('\xc0\xc0', FFFD*2),
        ('\xc1', FFFD),
        ('\xc1\xc0', FFFD*2),
        ('\xc0\xc1', FFFD*2),
        # with start byte of a 2-byte sequence
        ('\xc2', FFFD),                          # only the start byte
        ('\xc2\xc2', FFFD*2),                    # 2 start bytes
        ('\xc2\xc2\xc2', FFFD*3),                # 3 start bytes
        ('\xc2\x41', FFFD+'A'),                  # invalid continuation byte
        # with start byte of a 3-byte sequence
        ('\xe1', FFFD),                          # only the start byte
        ('\xe1\xe1', FFFD*2),                    # 2 start bytes
        ('\xe1\xe1\xe1', FFFD*3),                # 3 start bytes
        ('\xe1\xe1\xe1\xe1', FFFD*4),            # 4 start bytes
        ('\xe1\x80', FFFD),                      # only 1 continuation byte
        ('\xe1\x41', FFFD+'A'),                  # invalid continuation byte
        ('\xe1\x41\x80', FFFD+'A'+FFFD),         # invalid cb, then valid cb
        ('\xe1\x41\x41', FFFD+'AA'),             # 2 invalid cb
        ('\xe1\x80\x41', FFFD+'A'),              # only 1 valid cb
        ('\xe1\x80\xe1\x41', FFFD*2+'A'),        # 1 valid cb, the other invalid
        ('\xe1\x41\xe1\x80', FFFD+'A'+FFFD),     # 1 invalid cb, the other valid
        # with start byte of a 4-byte sequence
        ('\xf1', FFFD),                          # only the start byte
        ('\xf1\xf1', FFFD*2),                    # 2 start bytes
        ('\xf1\xf1\xf1', FFFD*3),                # 3 start bytes
        ('\xf1\xf1\xf1\xf1', FFFD*4),            # 4 start bytes
        ('\xf1\xf1\xf1\xf1\xf1', FFFD*5),        # 5 start bytes
        ('\xf1\x80', FFFD),                      # only 1 continuation byte
        ('\xf1\x80\x80', FFFD),                  # only 2 continuation bytes
        ('\xf1\x80\x41', FFFD+'A'),              # 1 valid cb and 1 invalid
        ('\xf1\x80\x41\x41', FFFD+'AA'),         # 1 valid cb and 2 invalid
        ('\xf1\x80\x80\x41', FFFD+'A'),          # 2 valid cb and 1 invalid
        ('\xf1\x41\x80', FFFD+'A'+FFFD),         # 1 invalid cb and 1 valid
        ('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2),   # 1 invalid cb and 2 valid
        ('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # invalid, valid, invalid cb
        ('\xf1\x41\x41\x80', FFFD+'AA'+FFFD),    # 2 invalid cb and 1 valid
        ('\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
        ('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
        ('\xf1\xf1\x80\x41', FFFD*2+'A'),
        ('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
        # with invalid start byte of a 4-byte sequence (rfc2279)
        ('\xf5', FFFD),                          # only the start byte
        ('\xf5\xf5', FFFD*2),                    # 2 start bytes
        ('\xf5\x80', FFFD*2),                    # only 1 continuation byte
        ('\xf5\x80\x80', FFFD*3),                # only 2 continuation bytes
        ('\xf5\x80\x80\x80', FFFD*4),            # 3 continuation bytes
        ('\xf5\x80\x41', FFFD*2+'A'),            # 1 valid cb and 1 invalid
        ('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
        ('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
        # with invalid start byte of a 5-byte sequence (rfc2279)
        ('\xf8', FFFD),                          # only the start byte
        ('\xf8\xf8', FFFD*2),                    # 2 start bytes
        ('\xf8\x80', FFFD*2),                    # only one continuation byte
        ('\xf8\x80\x41', FFFD*2 + 'A'),          # 1 valid cb and 1 invalid
        ('\xf8\x80\x80\x80\x80', FFFD*5),        # start byte + 4 cb
        # with invalid start byte of a 6-byte sequence (rfc2279)
        ('\xfc', FFFD),                          # only the start byte
        ('\xfc\xfc', FFFD*2),                    # 2 start bytes
        ('\xfc\x80\x80', FFFD*3),                # only 2 continuation bytes
        ('\xfc\x80\x80\x80\x80\x80', FFFD*6),    # start byte + 5 cb
        # invalid start byte
        ('\xfe', FFFD),
        ('\xfe\x80\x80', FFFD*3),
        # other sequences
        ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
        ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
        ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
        ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
         u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
    ]
    for seq, res in sequences:
        decode = seq.decode
        # strict decoding rejects the sequence outright
        self.assertRaises(UnicodeDecodeError, decode, 'utf-8', 'strict')
        self.assertEqual(decode('utf-8', 'replace'), res)
        # a trailing valid byte must not change the replacement count
        self.assertEqual((seq+'b').decode('utf-8', 'replace'), res+'b')
        # 'ignore' yields the same text minus the replacement characters
        self.assertEqual(decode('utf-8', 'ignore'), res.replace(FFFD, ''))
993
def test_codecs_idna(self):
    # Test whether trailing dot is preserved
    encoded = u"www.python.org.".encode("idna")
    self.assertEqual(encoded, "www.python.org.")
997
def test_codecs_errors(self):
    non_ascii_text = u'Andr\202 x'
    non_ascii_bytes = 'Andr\202 x'

    # Error handling (encoding)
    encode = non_ascii_text.encode
    self.assertRaises(UnicodeError, encode, 'ascii')
    self.assertRaises(UnicodeError, encode, 'ascii', 'strict')
    self.assertEqual(encode('ascii', 'ignore'), "Andr x")
    self.assertEqual(encode('ascii', 'replace'), "Andr? x")
    # positional and keyword spellings must give the same result
    self.assertEqual(encode('ascii', 'replace'),
                     encode('ascii', errors='replace'))
    self.assertEqual(encode('ascii', 'ignore'),
                     encode(encoding='ascii', errors='ignore'))

    # Error handling (decoding)
    self.assertRaises(UnicodeError, unicode, non_ascii_bytes, 'ascii')
    self.assertRaises(UnicodeError, unicode, non_ascii_bytes, 'ascii',
                      'strict')
    self.assertEqual(unicode(non_ascii_bytes, 'ascii', 'ignore'),
                     u"Andr x")
    self.assertEqual(unicode(non_ascii_bytes, 'ascii', 'replace'),
                     u'Andr\uFFFD x')
    # positional and keyword spellings must give the same result
    decode = u'abcde'.decode
    self.assertEqual(decode('ascii', 'ignore'),
                     decode('ascii', errors='ignore'))
    self.assertEqual(decode('ascii', 'replace'),
                     decode(encoding='ascii', errors='replace'))

    # Error handling (unknown character names)
    self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx")

    # Error handling (truncated escape sequence)
    self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")

    # Codecs registered at module level that return bad values
    for bad_call, bad_args in (
            ("hello".decode, ("test.unicode1",)),
            (unicode, ("hello", "test.unicode2")),
            (u"hello".encode, ("test.unicode1",)),
            (u"hello".encode, ("test.unicode2",)),
    ):
        self.assertRaises(TypeError, bad_call, *bad_args)

    # executes PyUnicode_Encode()
    import imp
    self.assertRaises(
        ImportError,
        imp.find_module,
        "non-existing module",
        [u"non-existing dir"]
    )

    # Error handling (wrong arguments)
    self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)

    # Error handling (PyUnicode_EncodeDecimal())
    self.assertRaises(UnicodeError, int, u"\u0200")
Guido van Rossum97064862000-04-10 13:52:48 +00001043
def test_codecs(self):
    def check_roundtrip(text, encodings):
        # encoding then decoding must give back the original text
        for encoding in encodings:
            self.assertEqual(unicode(text.encode(encoding), encoding), text)

    # Encoding
    hello_encodings = (
        ('ascii', 'hello'),
        ('utf-7', 'hello'),
        ('utf-8', 'hello'),
        ('utf8', 'hello'),
        ('utf-16-le', 'h\000e\000l\000l\000o\000'),
        ('utf-16-be', '\000h\000e\000l\000l\000o'),
        ('latin-1', 'hello'),
    )
    for encoding, expected in hello_encodings:
        self.assertEqual(u'hello'.encode(encoding), expected)

    # Roundtrip safety for BMP: the first 1024 chars for the Unicode
    # codecs, the first 256 for latin-1 and the first 128 for ascii
    per_char_roundtrips = (
        (1024, ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
                'utf-16-be', 'raw_unicode_escape',
                'unicode_escape', 'unicode_internal')),
        (256, ('latin-1',)),
        (128, ('ascii',)),
    )
    for limit, encodings in per_char_roundtrips:
        for code in xrange(limit):
            check_roundtrip(unichr(code), encodings)

    # Roundtrip safety for non-BMP (just a few chars)
    non_bmp = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
    check_roundtrip(non_bmp,
                    ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                     #'raw_unicode_escape',
                     'unicode_escape', 'unicode_internal'))

    # UTF-8 must be roundtrip safe for all UCS-2 code points
    # This excludes surrogates: in the full range, there would be
    # a surrogate pair (\udbff\udc00), which gets converted back
    # to a non-BMP character (\U0010fc00)
    ucs2_text = u''.join(map(unichr, range(0,0xd800)+range(0xe000,0x10000)))
    check_roundtrip(ucs2_text, ('utf-8',))
Guido van Rossum9e896b32000-04-05 20:11:21 +00001088
def test_codecs_charmap(self):
    def check_roundtrip(data, encodings):
        # decoding then re-encoding must reproduce the original bytes
        for encoding in encodings:
            self.assertEqual(unicode(data, encoding).encode(encoding), data)

    # 0-127
    low_half = ''.join(map(chr, xrange(128)))
    low_half_encodings = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
        'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        'cp1256', 'cp1257', 'cp1258',
        'cp856', 'cp857', 'cp864', 'cp869', 'cp874',

        'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
        'cp1006', 'iso8859_8',

        ### These have undefined mappings:
        #'cp424',

        ### These fail the round-trip:
        #'cp875'
    )
    check_roundtrip(low_half, low_half_encodings)

    # 128-255
    high_half = ''.join(map(chr, xrange(128, 256)))
    high_half_encodings = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_4', 'iso8859_5',
        'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        ### These have undefined mappings:
        #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        #'cp1256', 'cp1257', 'cp1258',
        #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
        #'iso8859_3', 'iso8859_6', 'iso8859_7',
        #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',

        ### These fail the round-trip:
        #'cp1006', 'cp875', 'iso8859_8',
    )
    check_roundtrip(high_half, high_half_encodings)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001142
def test_concatenation(self):
    # Adjacent string literals are joined by the compiler; as soon as one
    # of them is a unicode literal the result must be unicode.  The
    # literals have to stay adjacent here, that is what is being tested.
    joined_pairs = (
        ((u"abc" u"def"), u"abcdef"),
        (("abc" u"def"), u"abcdef"),
        ((u"abc" "def"), u"abcdef"),
        ((u"abc" u"def" "ghi"), u"abcdefghi"),
        (("abc" "def" u"ghi"), u"abcdefghi"),
    )
    for joined, expected in joined_pairs:
        self.assertEqual(joined, expected)
Fred Drake004d5e62000-10-23 17:22:08 +00001149
def test_printing(self):
    # Smoke test: none of the print statements below may raise when
    # unicode objects are written (alone, mixed with byte strings, with
    # or without a trailing comma) to a file-like object.
    class NullWriter:
        # file-like object that discards everything written to it
        def write(self, text):
            pass

    sink = NullWriter()
    print >>sink, u'abc'
    print >>sink, u'abc', u'def'
    print >>sink, u'abc', 'def'
    print >>sink, 'abc', u'def'
    print >>sink, u'abc\n'
    print >>sink, u'abc\n',
    print >>sink, u'abc\n',
    print >>sink, u'def\n'
    print >>sink, u'def\n'
Fred Drake004d5e62000-10-23 17:22:08 +00001165
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00001166 def test_ucs4(self):
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00001167 x = u'\U00100000'
1168 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
1169 self.assertEqual(x, y)
1170
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00001171 y = r'\U00100000'
1172 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1173 self.assertEqual(x, y)
1174 y = r'\U00010000'
1175 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1176 self.assertEqual(x, y)
1177
1178 try:
1179 '\U11111111'.decode("raw-unicode-escape")
1180 except UnicodeDecodeError as e:
1181 self.assertEqual(e.start, 0)
1182 self.assertEqual(e.end, 10)
1183 else:
1184 self.fail("Should have raised UnicodeDecodeError")
1185
def test_conversion(self):
    # Make sure __unicode__() works properly

    class Foo0:
        # classic class, __str__ only
        def __str__(self):
            return "foo"

    class Foo1:
        # classic class, __unicode__ returning unicode
        def __unicode__(self):
            return u"foo"

    class Foo2(object):
        # new-style class, __unicode__ returning unicode
        def __unicode__(self):
            return u"foo"

    class Foo3(object):
        # new-style class, __unicode__ returning a byte string
        def __unicode__(self):
            return "foo"

    class Foo4(str):
        def __unicode__(self):
            return "foo"

    class Foo5(unicode):
        def __unicode__(self):
            return "foo"

    class Foo6(str):
        def __str__(self):
            return "foos"

        def __unicode__(self):
            return u"foou"

    class Foo7(unicode):
        def __str__(self):
            return "foos"
        def __unicode__(self):
            return u"foou"

    class Foo8(unicode):
        # stores its content doubled and returns itself from __unicode__
        def __new__(cls, content=""):
            return unicode.__new__(cls, 2*content)
        def __unicode__(self):
            return self

    class Foo9(unicode):
        def __str__(self):
            return "string"
        def __unicode__(self):
            return "not unicode"

    conversions = (
        (Foo0(), u"foo"),
        (Foo1(), u"foo"),
        (Foo2(), u"foo"),
        (Foo3(), u"foo"),
        (Foo4("bar"), u"foo"),
        (Foo5("bar"), u"foo"),
        (Foo6("bar"), u"foou"),
        (Foo7("bar"), u"foou"),
        (Foo8("foo"), u"foofoo"),
    )
    for obj, expected in conversions:
        self.assertEqual(unicode(obj), expected)

    # str() and unicode() each use their own special method
    self.assertEqual(str(Foo9("foo")), "string")
    self.assertEqual(unicode(Foo9("foo")), u"not unicode")
1248
def test_unicode_repr(self):
    # repr() must give '\\n' whether __repr__ returns a byte string or a
    # unicode object.
    class BytesRepr:
        def __repr__(self):
            return '\\n'

    class UnicodeRepr:
        def __repr__(self):
            return u'\\n'

    for obj in (BytesRepr(), UnicodeRepr()):
        self.assertEqual(repr(obj), '\\n')
1260
# This test only affects 32-bit platforms because expandtabs can only take
# an int as the max value, not a 64-bit C long.  If expandtabs is changed
# to take a 64-bit long, this test should apply to all platforms.
# Skipping (instead of returning early) makes the test show up as skipped
# rather than as passed on platforms where nothing was checked.
@unittest.skipIf(sys.maxint > (1 << 32) or struct.calcsize('P') != 4,
                 'only applies to 32-bit platforms')
def test_expandtabs_overflows_gracefully(self):
    self.assertRaises(OverflowError, u't\tt\t'.expandtabs, sys.maxint)
Anthony Baxter67b6d512006-03-30 10:54:07 +00001268
def test__format__(self):
    def check(value, spec, expected):
        # test both with and without the trailing 's'
        self.assertEqual(value.__format__(spec), expected)
        self.assertEqual(value.__format__(spec + u's'), expected)

    # (value, format spec, expected result).  The padded results are
    # spelled out to the full field width requested by the spec.
    cases = [
        # no spec, and precision (truncation) only
        (u'', u'', u''),
        (u'abc', u'', u'abc'),
        (u'abc', u'.3', u'abc'),
        (u'ab', u'.3', u'ab'),
        (u'abcdef', u'.3', u'abc'),
        (u'abcdef', u'.0', u''),
        # width combined with precision
        (u'abc', u'3.3', u'abc'),
        (u'abc', u'2.3', u'abc'),
        (u'abc', u'2.2', u'ab'),
        (u'abc', u'3.2', u'ab '),
        # explicit fill character, left aligned
        (u'result', u'x<0', u'result'),
        (u'result', u'x<5', u'result'),
        (u'result', u'x<6', u'result'),
        (u'result', u'x<7', u'resultx'),
        (u'result', u'x<8', u'resultxx'),
        # default (space) fill with each alignment
        (u'result', u' <7', u'result '),
        (u'result', u'<7', u'result '),
        (u'result', u'>7', u' result'),
        (u'result', u'>8', u'  result'),
        # centering: an odd amount of padding puts the extra space on
        # the right
        (u'result', u'^8', u' result '),
        (u'result', u'^9', u' result  '),
        (u'result', u'^10', u'  result  '),
    ]
    for value, spec, expected in cases:
        check(value, spec, expected)

    # large field widths
    check(u'a', u'10000', u'a' + u' ' * 9999)
    check(u'', u'10000', u' ' * 10000)
    check(u'', u'10000000', u' ' * 10000000)

    # test mixing unicode and str
    self.assertEqual(u'abc'.__format__('s'), u'abc')
    self.assertEqual(u'abc'.__format__('->10s'), u'-------abc')
1304
def test_format(self):
    # Exercises unicode.format(): brace escaping, positional / keyword /
    # attribute / index field lookup, string format specs, !r and !s
    # conversions, user-defined __format__ hooks, computed (nested) specs,
    # malformed templates, and mixing str with unicode.
    #
    # Expected values that need more than one consecutive pad character are
    # written as explicit repetitions (u' ' * n) so the intended width is
    # unambiguous and cannot be lost to whitespace normalisation.
    self.assertEqual(u''.format(), u'')
    self.assertEqual(u'a'.format(), u'a')
    self.assertEqual(u'ab'.format(), u'ab')
    self.assertEqual(u'a{{'.format(), u'a{')
    self.assertEqual(u'a}}'.format(), u'a}')
    self.assertEqual(u'{{b'.format(), u'{b')
    self.assertEqual(u'}}b'.format(), u'}b')
    self.assertEqual(u'a{{b'.format(), u'a{b')

    # examples from the PEP:
    import datetime
    self.assertEqual(u"My name is {0}".format(u'Fred'), u"My name is Fred")
    self.assertEqual(u"My name is {0[name]}".format(dict(name=u'Fred')),
                     u"My name is Fred")
    self.assertEqual(u"My name is {0} :-{{}}".format(u'Fred'),
                     u"My name is Fred :-{}")

    # datetime.__format__ doesn't work with unicode
    #d = datetime.date(2007, 8, 18)
    #self.assertEqual("The year is {0.year}".format(d),
    #                 "The year is 2007")

    # classes we'll use for testing
    class C:
        def __init__(self, x=100):
            self._x = x
        def __format__(self, spec):
            return spec

    class D:
        def __init__(self, x):
            self.x = x
        def __format__(self, spec):
            return str(self.x)

    # class with __str__, but no __format__
    class E:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return u'E(' + self.x + u')'

    # class with __repr__, but no __format__ or __str__
    class F:
        def __init__(self, x):
            self.x = x
        def __repr__(self):
            return u'F(' + self.x + u')'

    # class with __format__ that forwards to string, for some format_spec's
    class G:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return u"string is " + self.x
        def __format__(self, format_spec):
            if format_spec == 'd':
                return u'G(' + self.x + u')'
            return object.__format__(self, format_spec)

    # class that returns a bad type from __format__
    class H:
        def __format__(self, format_spec):
            return 1.0

    class I(datetime.date):
        def __format__(self, format_spec):
            return self.strftime(format_spec)

    class J(int):
        def __format__(self, format_spec):
            return int.__format__(self * 2, format_spec)


    self.assertEqual(u''.format(), u'')
    self.assertEqual(u'abc'.format(), u'abc')
    self.assertEqual(u'{0}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:}'.format(u'abc'), u'abc')
    self.assertEqual(u'X{0}'.format(u'abc'), u'Xabc')
    self.assertEqual(u'{0}X'.format(u'abc'), u'abcX')
    self.assertEqual(u'X{0}Y'.format(u'abc'), u'XabcY')
    self.assertEqual(u'{1}'.format(1, u'abc'), u'abc')
    self.assertEqual(u'X{1}'.format(1, u'abc'), u'Xabc')
    self.assertEqual(u'{1}X'.format(1, u'abc'), u'abcX')
    self.assertEqual(u'X{1}Y'.format(1, u'abc'), u'XabcY')
    self.assertEqual(u'{0}'.format(-15), u'-15')
    self.assertEqual(u'{0}{1}'.format(-15, u'abc'), u'-15abc')
    self.assertEqual(u'{0}X{1}'.format(-15, u'abc'), u'-15Xabc')
    self.assertEqual(u'{{'.format(), u'{')
    self.assertEqual(u'}}'.format(), u'}')
    self.assertEqual(u'{{}}'.format(), u'{}')
    self.assertEqual(u'{{x}}'.format(), u'{x}')
    self.assertEqual(u'{{{0}}}'.format(123), u'{123}')
    self.assertEqual(u'{{{{0}}}}'.format(), u'{{0}}')
    self.assertEqual(u'}}{{'.format(), u'}{')
    self.assertEqual(u'}}x{{'.format(), u'}x{')

    # weird field names
    self.assertEqual(u"{0[foo-bar]}".format({u'foo-bar':u'baz'}), u'baz')
    self.assertEqual(u"{0[foo bar]}".format({u'foo bar':u'baz'}), u'baz')
    self.assertEqual(u"{0[ ]}".format({u' ':3}), u'3')

    self.assertEqual(u'{foo._x}'.format(foo=C(20)), u'20')
    self.assertEqual(u'{1}{0}'.format(D(10), D(20)), u'2010')
    self.assertEqual(u'{0._x.x}'.format(C(D(u'abc'))), u'abc')
    self.assertEqual(u'{0[0]}'.format([u'abc', u'def']), u'abc')
    self.assertEqual(u'{0[1]}'.format([u'abc', u'def']), u'def')
    self.assertEqual(u'{0[1][0]}'.format([u'abc', [u'def']]), u'def')
    self.assertEqual(u'{0[1][0].x}'.format(['abc', [D(u'def')]]), u'def')

    # strings
    self.assertEqual(u'{0:.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:.3s}'.format(u'ab'), u'ab')
    self.assertEqual(u'{0:.3s}'.format(u'abcdef'), u'abc')
    self.assertEqual(u'{0:.0s}'.format(u'abcdef'), u'')
    self.assertEqual(u'{0:3.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:2.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:2.2s}'.format(u'abc'), u'ab')
    self.assertEqual(u'{0:3.2s}'.format(u'abc'), u'ab ')
    self.assertEqual(u'{0:x<0s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<5s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<6s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<7s}'.format(u'result'), u'resultx')
    self.assertEqual(u'{0:x<8s}'.format(u'result'), u'resultxx')
    self.assertEqual(u'{0: <7s}'.format(u'result'), u'result ')
    self.assertEqual(u'{0:<7s}'.format(u'result'), u'result ')
    self.assertEqual(u'{0:>7s}'.format(u'result'), u' result')
    # width 8, six characters: two pad characters on the left
    self.assertEqual(u'{0:>8s}'.format(u'result'), u' ' * 2 + u'result')
    self.assertEqual(u'{0:^8s}'.format(u'result'), u' result ')
    # odd amount of centring padding: the extra character goes on the right
    self.assertEqual(u'{0:^9s}'.format(u'result'), u' result' + u' ' * 2)
    self.assertEqual(u'{0:^10s}'.format(u'result'),
                     u' ' * 2 + u'result' + u' ' * 2)
    self.assertEqual(u'{0:10000}'.format(u'a'), u'a' + u' ' * 9999)
    self.assertEqual(u'{0:10000}'.format(u''), u' ' * 10000)
    self.assertEqual(u'{0:10000000}'.format(u''), u' ' * 10000000)

    # format specifiers for user defined type
    self.assertEqual(u'{0:abc}'.format(C()), u'abc')

    # !r and !s coercions
    self.assertEqual(u'{0!s}'.format(u'Hello'), u'Hello')
    self.assertEqual(u'{0!s:}'.format(u'Hello'), u'Hello')
    # width 15, five characters: ten pad characters on the right
    self.assertEqual(u'{0!s:15}'.format(u'Hello'), u'Hello' + u' ' * 10)
    self.assertEqual(u'{0!s:15s}'.format(u'Hello'), u'Hello' + u' ' * 10)
    self.assertEqual(u'{0!r}'.format(u'Hello'), u"u'Hello'")
    self.assertEqual(u'{0!r:}'.format(u'Hello'), u"u'Hello'")
    self.assertEqual(u'{0!r}'.format(F(u'Hello')), u'F(Hello)')

    # test fallback to object.__format__
    self.assertEqual(u'{0}'.format({}), u'{}')
    self.assertEqual(u'{0}'.format([]), u'[]')
    self.assertEqual(u'{0}'.format([1]), u'[1]')
    self.assertEqual(u'{0}'.format(E(u'data')), u'E(data)')
    self.assertEqual(u'{0:d}'.format(G(u'data')), u'G(data)')
    self.assertEqual(u'{0!s}'.format(G(u'data')), u'string is data')

    msg = 'object.__format__ with a non-empty format string is deprecated'
    with test_support.check_warnings((msg, PendingDeprecationWarning)):
        # u'E(data)' is seven characters; centring in ten leaves one pad
        # character on the left and two on the right
        self.assertEqual(u'{0:^10}'.format(E(u'data')),
                         u' E(data)' + u' ' * 2)
        self.assertEqual(u'{0:^10s}'.format(E(u'data')),
                         u' E(data)' + u' ' * 2)
        self.assertEqual(u'{0:>15s}'.format(G(u'data')), u' string is data')

    self.assertEqual(u"{0:date: %Y-%m-%d}".format(I(year=2007,
                                                     month=8,
                                                     day=27)),
                     u"date: 2007-08-27")

    # test deriving from a builtin type and overriding __format__
    self.assertEqual(u"{0}".format(J(10)), u"20")


    # string format specifiers
    self.assertEqual(u'{0:}'.format('a'), u'a')

    # computed format specifiers
    self.assertEqual(u"{0:.{1}}".format(u'hello world', 5), u'hello')
    self.assertEqual(u"{0:.{1}s}".format(u'hello world', 5), u'hello')
    self.assertEqual(u"{0:.{precision}s}".format('hello world', precision=5),
                     u'hello')
    # precision 5 truncates to u'hello'; width 10 then adds five pad characters
    self.assertEqual(u"{0:{width}.{precision}s}".format('hello world',
                                                        width=10,
                                                        precision=5),
                     u'hello' + u' ' * 5)
    self.assertEqual(u"{0:{width}.{precision}s}".format('hello world',
                                                        width='10',
                                                        precision='5'),
                     u'hello' + u' ' * 5)

    # test various errors
    self.assertRaises(ValueError, u'{'.format)
    self.assertRaises(ValueError, u'}'.format)
    self.assertRaises(ValueError, u'a{'.format)
    self.assertRaises(ValueError, u'a}'.format)
    self.assertRaises(ValueError, u'{a'.format)
    self.assertRaises(ValueError, u'}a'.format)
    self.assertRaises(IndexError, u'{0}'.format)
    self.assertRaises(IndexError, u'{1}'.format, u'abc')
    self.assertRaises(KeyError, u'{x}'.format)
    self.assertRaises(ValueError, u"}{".format)
    self.assertRaises(ValueError, u"{".format)
    self.assertRaises(ValueError, u"}".format)
    self.assertRaises(ValueError, u"abc{0:{}".format)
    self.assertRaises(ValueError, u"{0".format)
    self.assertRaises(IndexError, u"{0.}".format)
    self.assertRaises(ValueError, u"{0.}".format, 0)
    self.assertRaises(IndexError, u"{0[}".format)
    self.assertRaises(ValueError, u"{0[}".format, [])
    self.assertRaises(KeyError, u"{0]}".format)
    self.assertRaises(ValueError, u"{0.[]}".format, 0)
    self.assertRaises(ValueError, u"{0..foo}".format, 0)
    self.assertRaises(ValueError, u"{0[0}".format, 0)
    self.assertRaises(ValueError, u"{0[0:foo}".format, 0)
    self.assertRaises(KeyError, u"{c]}".format)
    self.assertRaises(ValueError, u"{{ {{{0}}".format, 0)
    self.assertRaises(ValueError, u"{0}}".format, 0)
    self.assertRaises(KeyError, u"{foo}".format, bar=3)
    self.assertRaises(ValueError, u"{0!x}".format, 3)
    self.assertRaises(ValueError, u"{0!}".format, 0)
    self.assertRaises(ValueError, u"{0!rs}".format, 0)
    self.assertRaises(ValueError, u"{!}".format)
    self.assertRaises(IndexError, u"{:}".format)
    self.assertRaises(IndexError, u"{:s}".format)
    self.assertRaises(IndexError, u"{}".format)
    big = u"23098475029384702983476098230754973209482573"
    self.assertRaises(ValueError, (u"{" + big + u"}").format)
    self.assertRaises(ValueError, (u"{[" + big + u"]}").format, [0])

    # issue 6089
    self.assertRaises(ValueError, u"{0[0]x}".format, [None])
    self.assertRaises(ValueError, u"{0[0](10)}".format, [None])

    # can't have a replacement on the field name portion
    self.assertRaises(TypeError, u'{0[{1}]}'.format, u'abcdefg', 4)

    # exceed maximum recursion depth
    self.assertRaises(ValueError, u"{0:{1:{2}}}".format, u'abc', u's', u'')
    self.assertRaises(ValueError, u"{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
                      0, 1, 2, 3, 4, 5, 6, 7)

    # string format spec errors
    self.assertRaises(ValueError, u"{0:-s}".format, u'')
    self.assertRaises(ValueError, format, u"", u"-")
    self.assertRaises(ValueError, u"{0:=s}".format, u'')

    # test combining string and unicode
    self.assertEqual(u"foo{0}".format('bar'), u'foobar')
    # This will try to convert the argument from unicode to str, which
    # will succeed
    self.assertEqual("foo{0}".format(u'bar'), 'foobar')
    # This will try to convert the argument from unicode to str, which
    # will fail
    self.assertRaises(UnicodeEncodeError, "foo{0}".format, u'\u1000bar')
1550
def test_format_auto_numbering(self):
    # Exercises automatically numbered replacement fields ({} with no
    # index): alone, with attribute/index lookup, inside nested specs,
    # mixed with named fields, and the error raised when they are mixed
    # with explicitly numbered fields.
    #
    # Expected values that need more than one consecutive pad character are
    # written as explicit repetitions (u' ' * n) so the intended width is
    # unambiguous and cannot be lost to whitespace normalisation.
    class C:
        def __init__(self, x=100):
            self._x = x
        def __format__(self, spec):
            return spec

    self.assertEqual(u'{}'.format(10), u'10')
    # width 5, one character: four pad characters on the right
    self.assertEqual(u'{:5}'.format('s'), u's' + u' ' * 4)
    self.assertEqual(u'{!r}'.format('s'), u"'s'")
    self.assertEqual(u'{._x}'.format(C(10)), u'10')
    self.assertEqual(u'{[1]}'.format([1, 2]), u'2')
    self.assertEqual(u'{[a]}'.format({'a':4, 'b':2}), u'4')
    self.assertEqual(u'a{}b{}c'.format(0, 1), u'a0b1c')

    # 'x' centred in ten columns: four pad characters before, five after
    self.assertEqual(u'a{:{}}b'.format('x', '^10'),
                     u'a' + u' ' * 4 + u'x' + u' ' * 5 + u'b')
    self.assertEqual(u'a{:{}x}b'.format(20, '#'), u'a0x14b')

    # can't mix and match numbering and auto-numbering
    self.assertRaises(ValueError, u'{}{1}'.format, 1, 2)
    self.assertRaises(ValueError, u'{1}{}'.format, 1, 2)
    self.assertRaises(ValueError, u'{:{1}}'.format, 1, 2)
    self.assertRaises(ValueError, u'{0:{}}'.format, 1, 2)

    # can mix and match auto-numbering and named
    self.assertEqual(u'{f}{}'.format(4, f='test'), u'test4')
    self.assertEqual(u'{}{f}'.format(4, f='test'), u'4test')
    self.assertEqual(u'{:{f}}{g}{}'.format(1, 3, g='g', f=2), u' 1g3')
    self.assertEqual(u'{f:{}}{}{g}'.format(2, 4, f=1, g='g'), u' 14g')
1580
def test_raiseMemError(self):
    # Ensure that the freelist contains a consistent object, even
    # when a string allocation fails with a MemoryError.
    # This used to crash the interpreter,
    # or leak references when the number was smaller.
    if sys.maxunicode >= 0x10000:
        bytes_per_char = 4
    else:
        bytes_per_char = 2

    def allocate_oversized():
        # Note: sys.maxsize is half of the actual max allocation because of
        # the signedness of Py_ssize_t.
        return u"a" * (sys.maxsize // bytes_per_char * 2)

    # The failing allocation is attempted twice in a row.
    for _attempt in (1, 2):
        self.assertRaises(MemoryError, allocate_oversized)
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +00001592
def test_format_subclass(self):
    # A unicode subclass that overrides __unicode__ must have that override
    # used by both "%s" interpolation and str.format().
    class OverridesUnicode(unicode):
        def __unicode__(self):
            return u'__unicode__ overridden'

    instance = OverridesUnicode(u'xxx')
    percent_result = "%s" % instance
    brace_result = "{}".format(instance)
    self.assertEqual(percent_result, u'__unicode__ overridden')
    self.assertEqual(brace_result, '__unicode__ overridden')
Victor Stinner95affc42010-03-22 12:24:37 +00001600
1601
def test_main():
    """Run the tests of this module through test_support.run_unittest."""
    this_module = __name__
    test_support.run_unittest(this_module)


if __name__ == "__main__":
    # Executed directly as a script: run the module's tests.
    test_main()