blob: 63fb8316b5f0eb994c51fc16e6a744f0f9e5aaf7 [file] [log] [blame]
Guido van Rossuma831cac2000-03-10 23:23:21 +00001""" Test script for the Unicode implementation.
2
Guido van Rossuma831cac2000-03-10 23:23:21 +00003Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
Marc-André Lemburg36619082001-01-17 19:11:13 +00007"""#"
Ezio Melotti12682b12011-08-22 23:46:30 +03008import sys
9import struct
10import codecs
11import unittest
Walter Dörwald0fd583c2003-02-21 12:53:50 +000012from test import test_support, string_tests
Guido van Rossuma831cac2000-03-10 23:23:21 +000013
Ezio Melotti12682b12011-08-22 23:46:30 +030014# decorator to skip tests on narrow builds
# A narrow build cannot represent code points above U+FFFF in one unit.
requires_wide_build = unittest.skipIf(
    sys.maxunicode == 0xFFFF,
    'requires wide build',
)
17
Neal Norwitz430f68b2005-11-24 22:00:56 +000018# Error handling (bad decoder return)
def search_function(encoding):
    # Codec search hook that serves two deliberately broken codecs:
    #   "test.unicode1" -> encoder/decoder return a bare int, not a tuple
    #   "test.unicode2" -> encoder/decoder return a tuple holding no text
    # Any other encoding name is declined by returning None.
    def decode1(input, errors="strict"):
        return 42 # not a tuple

    def encode1(input, errors="strict"):
        return 42 # not a tuple

    def encode2(input, errors="strict"):
        return (42, 42) # no unicode

    def decode2(input, errors="strict"):
        return (42, 42) # no unicode

    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    if encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    return None


codecs.register(search_function)
35
class UnicodeSubclass(unicode):
    # Plain subclass of unicode that adds no behaviour of its own.
    # NOTE(review): its users are not visible in this part of the file.
    pass
38
Walter Dörwald0fd583c2003-02-21 12:53:50 +000039class UnicodeTest(
40 string_tests.CommonTest,
Walter Dörwald57d88e52004-08-26 16:53:04 +000041 string_tests.MixinStrUnicodeUserStringTest,
42 string_tests.MixinStrUnicodeTest,
Walter Dörwald0fd583c2003-02-21 12:53:50 +000043 ):
44 type2test = unicode
45
Florent Xiclunac0c0b142010-09-13 08:53:00 +000046 def assertEqual(self, first, second, msg=None):
47 # strict assertEqual method: reject implicit bytes/unicode equality
48 super(UnicodeTest, self).assertEqual(first, second, msg)
49 if isinstance(first, unicode) or isinstance(second, unicode):
50 self.assertIsInstance(first, unicode)
51 self.assertIsInstance(second, unicode)
52 elif isinstance(first, str) or isinstance(second, str):
53 self.assertIsInstance(first, str)
54 self.assertIsInstance(second, str)
55
Walter Dörwald0fd583c2003-02-21 12:53:50 +000056 def checkequalnofix(self, result, object, methodname, *args):
57 method = getattr(object, methodname)
58 realresult = method(*args)
59 self.assertEqual(realresult, result)
Benjamin Peterson5c8da862009-06-30 22:57:08 +000060 self.assertTrue(type(realresult) is type(result))
Walter Dörwald0fd583c2003-02-21 12:53:50 +000061
62 # if the original is returned make sure that
63 # this doesn't happen with subclasses
64 if realresult is object:
65 class usub(unicode):
66 def __repr__(self):
67 return 'usub(%r)' % unicode.__repr__(self)
68 object = usub(object)
69 method = getattr(object, methodname)
70 realresult = method(*args)
71 self.assertEqual(realresult, result)
Benjamin Peterson5c8da862009-06-30 22:57:08 +000072 self.assertTrue(object is not realresult)
Guido van Rossume4874ae2001-09-21 15:36:41 +000073
Jeremy Hylton504de6b2003-10-06 05:08:26 +000074 def test_literals(self):
75 self.assertEqual(u'\xff', u'\u00ff')
76 self.assertEqual(u'\uffff', u'\U0000ffff')
Kurt B. Kaiserdb98f362007-07-18 19:58:42 +000077 self.assertRaises(SyntaxError, eval, 'u\'\\Ufffffffe\'')
78 self.assertRaises(SyntaxError, eval, 'u\'\\Uffffffff\'')
79 self.assertRaises(SyntaxError, eval, 'u\'\\U%08x\'' % 0x110000)
Jeremy Hylton504de6b2003-10-06 05:08:26 +000080
Walter Dörwald28256f22003-01-19 16:59:20 +000081 def test_repr(self):
82 if not sys.platform.startswith('java'):
83 # Test basic sanity of repr()
84 self.assertEqual(repr(u'abc'), "u'abc'")
85 self.assertEqual(repr(u'ab\\c'), "u'ab\\\\c'")
86 self.assertEqual(repr(u'ab\\'), "u'ab\\\\'")
87 self.assertEqual(repr(u'\\c'), "u'\\\\c'")
88 self.assertEqual(repr(u'\\'), "u'\\\\'")
89 self.assertEqual(repr(u'\n'), "u'\\n'")
90 self.assertEqual(repr(u'\r'), "u'\\r'")
91 self.assertEqual(repr(u'\t'), "u'\\t'")
92 self.assertEqual(repr(u'\b'), "u'\\x08'")
93 self.assertEqual(repr(u"'\""), """u'\\'"'""")
94 self.assertEqual(repr(u"'\""), """u'\\'"'""")
95 self.assertEqual(repr(u"'"), '''u"'"''')
96 self.assertEqual(repr(u'"'), """u'"'""")
97 latin1repr = (
98 "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
99 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
100 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
101 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
102 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
103 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
104 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
105 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
106 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
107 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
108 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
109 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
110 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
111 "\\xfe\\xff'")
112 testrepr = repr(u''.join(map(unichr, xrange(256))))
113 self.assertEqual(testrepr, latin1repr)
Neal Norwitz17753ec2006-08-21 22:21:19 +0000114 # Test repr works on wide unicode escapes without overflow.
115 self.assertEqual(repr(u"\U00010000" * 39 + u"\uffff" * 4096),
116 repr(u"\U00010000" * 39 + u"\uffff" * 4096))
117
Walter Dörwald28256f22003-01-19 16:59:20 +0000118
Walter Dörwald28256f22003-01-19 16:59:20 +0000119 def test_count(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000120 string_tests.CommonTest.test_count(self)
121 # check mixed argument types
122 self.checkequalnofix(3, 'aaa', 'count', u'a')
123 self.checkequalnofix(0, 'aaa', 'count', u'b')
124 self.checkequalnofix(3, u'aaa', 'count', 'a')
125 self.checkequalnofix(0, u'aaa', 'count', 'b')
126 self.checkequalnofix(0, u'aaa', 'count', 'b')
127 self.checkequalnofix(1, u'aaa', 'count', 'a', -1)
128 self.checkequalnofix(3, u'aaa', 'count', 'a', -10)
129 self.checkequalnofix(2, u'aaa', 'count', 'a', 0, -1)
130 self.checkequalnofix(0, u'aaa', 'count', 'a', 0, -10)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000131
Walter Dörwald28256f22003-01-19 16:59:20 +0000132 def test_find(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000133 self.checkequalnofix(0, u'abcdefghiabc', 'find', u'abc')
134 self.checkequalnofix(9, u'abcdefghiabc', 'find', u'abc', 1)
135 self.checkequalnofix(-1, u'abcdefghiabc', 'find', u'def', 4)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000136
Walter Dörwald28256f22003-01-19 16:59:20 +0000137 self.assertRaises(TypeError, u'hello'.find)
138 self.assertRaises(TypeError, u'hello'.find, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000139
Walter Dörwald28256f22003-01-19 16:59:20 +0000140 def test_rfind(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000141 string_tests.CommonTest.test_rfind(self)
142 # check mixed argument types
143 self.checkequalnofix(9, 'abcdefghiabc', 'rfind', u'abc')
144 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', u'')
145 self.checkequalnofix(12, u'abcdefghiabc', 'rfind', '')
Guido van Rossum8b264542000-12-19 02:22:31 +0000146
Walter Dörwald28256f22003-01-19 16:59:20 +0000147 def test_index(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000148 string_tests.CommonTest.test_index(self)
149 # check mixed argument types
150 for (t1, t2) in ((str, unicode), (unicode, str)):
151 self.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2(''))
152 self.checkequalnofix(3, t1('abcdefghiabc'), 'index', t2('def'))
153 self.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2('abc'))
154 self.checkequalnofix(9, t1('abcdefghiabc'), 'index', t2('abc'), 1)
155 self.assertRaises(ValueError, t1('abcdefghiabc').index, t2('hib'))
156 self.assertRaises(ValueError, t1('abcdefghiab').index, t2('abc'), 1)
157 self.assertRaises(ValueError, t1('abcdefghi').index, t2('ghi'), 8)
158 self.assertRaises(ValueError, t1('abcdefghi').index, t2('ghi'), -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000159
Walter Dörwald28256f22003-01-19 16:59:20 +0000160 def test_rindex(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000161 string_tests.CommonTest.test_rindex(self)
162 # check mixed argument types
163 for (t1, t2) in ((str, unicode), (unicode, str)):
164 self.checkequalnofix(12, t1('abcdefghiabc'), 'rindex', t2(''))
165 self.checkequalnofix(3, t1('abcdefghiabc'), 'rindex', t2('def'))
166 self.checkequalnofix(9, t1('abcdefghiabc'), 'rindex', t2('abc'))
167 self.checkequalnofix(0, t1('abcdefghiabc'), 'rindex', t2('abc'), 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000168
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000169 self.assertRaises(ValueError, t1('abcdefghiabc').rindex, t2('hib'))
170 self.assertRaises(ValueError, t1('defghiabc').rindex, t2('def'), 1)
171 self.assertRaises(ValueError, t1('defghiabc').rindex, t2('abc'), 0, -1)
172 self.assertRaises(ValueError, t1('abcdefghi').rindex, t2('ghi'), 0, 8)
173 self.assertRaises(ValueError, t1('abcdefghi').rindex, t2('ghi'), 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000174
Walter Dörwald28256f22003-01-19 16:59:20 +0000175 def test_translate(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000176 self.checkequalnofix(u'bbbc', u'abababc', 'translate', {ord('a'):None})
177 self.checkequalnofix(u'iiic', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i')})
178 self.checkequalnofix(u'iiix', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})
179 self.checkequalnofix(u'<i><i><i>c', u'abababc', 'translate', {ord('a'):None, ord('b'):u'<i>'})
180 self.checkequalnofix(u'c', u'abababc', 'translate', {ord('a'):None, ord('b'):u''})
Walter Dörwaldcd736e72004-02-05 17:36:00 +0000181 self.checkequalnofix(u'xyyx', u'xzx', 'translate', {ord('z'):u'yy'})
Guido van Rossuma831cac2000-03-10 23:23:21 +0000182
Walter Dörwald28256f22003-01-19 16:59:20 +0000183 self.assertRaises(TypeError, u'hello'.translate)
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000184 self.assertRaises(TypeError, u'abababc'.translate, {ord('a'):''})
Guido van Rossuma831cac2000-03-10 23:23:21 +0000185
Walter Dörwald28256f22003-01-19 16:59:20 +0000186 def test_split(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000187 string_tests.CommonTest.test_split(self)
Andrew M. Kuchlingeddd68d2002-03-29 16:21:44 +0000188
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000189 # Mixed arguments
190 self.checkequalnofix([u'a', u'b', u'c', u'd'], u'a//b//c//d', 'split', '//')
191 self.checkequalnofix([u'a', u'b', u'c', u'd'], 'a//b//c//d', 'split', u'//')
192 self.checkequalnofix([u'endcase ', u''], u'endcase test', 'split', 'test')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000193
Walter Dörwald28256f22003-01-19 16:59:20 +0000194 def test_join(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000195 string_tests.MixinStrUnicodeUserStringTest.test_join(self)
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000196
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000197 # mixed arguments
198 self.checkequalnofix(u'a b c d', u' ', 'join', ['a', 'b', u'c', u'd'])
199 self.checkequalnofix(u'abcd', u'', 'join', (u'a', u'b', u'c', u'd'))
200 self.checkequalnofix(u'w x y z', u' ', 'join', string_tests.Sequence('wxyz'))
201 self.checkequalnofix(u'a b c d', ' ', 'join', [u'a', u'b', u'c', u'd'])
202 self.checkequalnofix(u'a b c d', ' ', 'join', ['a', 'b', u'c', u'd'])
203 self.checkequalnofix(u'abcd', '', 'join', (u'a', u'b', u'c', u'd'))
204 self.checkequalnofix(u'w x y z', ' ', 'join', string_tests.Sequence(u'wxyz'))
Marc-André Lemburge5034372000-08-08 08:04:29 +0000205
Walter Dörwald28256f22003-01-19 16:59:20 +0000206 def test_strip(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000207 string_tests.CommonTest.test_strip(self)
Walter Dörwald28256f22003-01-19 16:59:20 +0000208 self.assertRaises(UnicodeError, u"hello".strip, "\xff")
Guido van Rossuma831cac2000-03-10 23:23:21 +0000209
Walter Dörwald28256f22003-01-19 16:59:20 +0000210 def test_replace(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000211 string_tests.CommonTest.test_replace(self)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000212
Walter Dörwald28256f22003-01-19 16:59:20 +0000213 # method call forwarded from str implementation because of unicode argument
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000214 self.checkequalnofix(u'one@two!three!', 'one!two!three!', 'replace', u'!', u'@', 1)
Walter Dörwald28256f22003-01-19 16:59:20 +0000215 self.assertRaises(TypeError, 'replace'.replace, u"r", 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000216
Walter Dörwald28256f22003-01-19 16:59:20 +0000217 def test_comparison(self):
218 # Comparisons:
Florent Xiclunac0c0b142010-09-13 08:53:00 +0000219 self.assertTrue(u'abc' == 'abc')
220 self.assertTrue('abc' == u'abc')
221 self.assertTrue(u'abc' == u'abc')
Benjamin Peterson5c8da862009-06-30 22:57:08 +0000222 self.assertTrue(u'abcd' > 'abc')
223 self.assertTrue('abcd' > u'abc')
224 self.assertTrue(u'abcd' > u'abc')
225 self.assertTrue(u'abc' < 'abcd')
226 self.assertTrue('abc' < u'abcd')
227 self.assertTrue(u'abc' < u'abcd')
Walter Dörwald28256f22003-01-19 16:59:20 +0000228
229 if 0:
230 # Move these tests to a Unicode collation module test...
231 # Testing UTF-16 code point order comparisons...
232
233 # No surrogates, no fixup required.
Benjamin Peterson5c8da862009-06-30 22:57:08 +0000234 self.assertTrue(u'\u0061' < u'\u20ac')
Walter Dörwald28256f22003-01-19 16:59:20 +0000235 # Non surrogate below surrogate value, no fixup required
Benjamin Peterson5c8da862009-06-30 22:57:08 +0000236 self.assertTrue(u'\u0061' < u'\ud800\udc02')
Walter Dörwald28256f22003-01-19 16:59:20 +0000237
238 # Non surrogate above surrogate value, fixup required
239 def test_lecmp(s, s2):
Benjamin Peterson5c8da862009-06-30 22:57:08 +0000240 self.assertTrue(s < s2)
Walter Dörwald28256f22003-01-19 16:59:20 +0000241
242 def test_fixup(s):
243 s2 = u'\ud800\udc01'
244 test_lecmp(s, s2)
245 s2 = u'\ud900\udc01'
246 test_lecmp(s, s2)
247 s2 = u'\uda00\udc01'
248 test_lecmp(s, s2)
249 s2 = u'\udb00\udc01'
250 test_lecmp(s, s2)
251 s2 = u'\ud800\udd01'
252 test_lecmp(s, s2)
253 s2 = u'\ud900\udd01'
254 test_lecmp(s, s2)
255 s2 = u'\uda00\udd01'
256 test_lecmp(s, s2)
257 s2 = u'\udb00\udd01'
258 test_lecmp(s, s2)
259 s2 = u'\ud800\ude01'
260 test_lecmp(s, s2)
261 s2 = u'\ud900\ude01'
262 test_lecmp(s, s2)
263 s2 = u'\uda00\ude01'
264 test_lecmp(s, s2)
265 s2 = u'\udb00\ude01'
266 test_lecmp(s, s2)
267 s2 = u'\ud800\udfff'
268 test_lecmp(s, s2)
269 s2 = u'\ud900\udfff'
270 test_lecmp(s, s2)
271 s2 = u'\uda00\udfff'
272 test_lecmp(s, s2)
273 s2 = u'\udb00\udfff'
274 test_lecmp(s, s2)
275
276 test_fixup(u'\ue000')
277 test_fixup(u'\uff61')
278
279 # Surrogates on both sides, no fixup required
Benjamin Peterson5c8da862009-06-30 22:57:08 +0000280 self.assertTrue(u'\ud800\udc02' < u'\ud84d\udc56')
Walter Dörwald28256f22003-01-19 16:59:20 +0000281
Ezio Melottiea7b6f62011-08-15 10:04:28 +0300282 def test_capitalize(self):
283 string_tests.CommonTest.test_capitalize(self)
284 # check that titlecased chars are lowered correctly
285 # \u1ffc is the titlecased char
286 self.checkequal(u'\u1ffc\u1ff3\u1ff3\u1ff3',
287 u'\u1ff3\u1ff3\u1ffc\u1ffc', 'capitalize')
288 # check with cased non-letter chars
289 self.checkequal(u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
290 u'\u24c5\u24ce\u24c9\u24bd\u24c4\u24c3', 'capitalize')
291 self.checkequal(u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
292 u'\u24df\u24e8\u24e3\u24d7\u24de\u24dd', 'capitalize')
293 self.checkequal(u'\u2160\u2171\u2172',
294 u'\u2160\u2161\u2162', 'capitalize')
295 self.checkequal(u'\u2160\u2171\u2172',
296 u'\u2170\u2171\u2172', 'capitalize')
297 # check with Ll chars with no upper - nothing changes here
298 self.checkequal(u'\u019b\u1d00\u1d86\u0221\u1fb7',
299 u'\u019b\u1d00\u1d86\u0221\u1fb7', 'capitalize')
300
Walter Dörwald28256f22003-01-19 16:59:20 +0000301 def test_islower(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000302 string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
303 self.checkequalnofix(False, u'\u1FFc', 'islower')
Walter Dörwald28256f22003-01-19 16:59:20 +0000304
Ezio Melotti12682b12011-08-22 23:46:30 +0300305 @requires_wide_build
306 def test_islower_non_bmp(self):
307 # non-BMP, uppercase
308 self.assertFalse(u'\U00010401'.islower())
309 self.assertFalse(u'\U00010427'.islower())
310 # non-BMP, lowercase
311 self.assertTrue(u'\U00010429'.islower())
312 self.assertTrue(u'\U0001044E'.islower())
313 # non-BMP, non-cased
314 self.assertFalse(u'\U0001F40D'.islower())
315 self.assertFalse(u'\U0001F46F'.islower())
316
Walter Dörwald28256f22003-01-19 16:59:20 +0000317 def test_isupper(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000318 string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
319 if not sys.platform.startswith('java'):
320 self.checkequalnofix(False, u'\u1FFc', 'isupper')
Walter Dörwald28256f22003-01-19 16:59:20 +0000321
Ezio Melotti12682b12011-08-22 23:46:30 +0300322 @requires_wide_build
323 def test_isupper_non_bmp(self):
324 # non-BMP, uppercase
325 self.assertTrue(u'\U00010401'.isupper())
326 self.assertTrue(u'\U00010427'.isupper())
327 # non-BMP, lowercase
328 self.assertFalse(u'\U00010429'.isupper())
329 self.assertFalse(u'\U0001044E'.isupper())
330 # non-BMP, non-cased
331 self.assertFalse(u'\U0001F40D'.isupper())
332 self.assertFalse(u'\U0001F46F'.isupper())
333
Walter Dörwald28256f22003-01-19 16:59:20 +0000334 def test_istitle(self):
Ezio Melotti12682b12011-08-22 23:46:30 +0300335 string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000336 self.checkequalnofix(True, u'\u1FFc', 'istitle')
337 self.checkequalnofix(True, u'Greek \u1FFcitlecases ...', 'istitle')
Walter Dörwald28256f22003-01-19 16:59:20 +0000338
Ezio Melotti12682b12011-08-22 23:46:30 +0300339 @requires_wide_build
340 def test_istitle_non_bmp(self):
341 # non-BMP, uppercase + lowercase
342 self.assertTrue(u'\U00010401\U00010429'.istitle())
343 self.assertTrue(u'\U00010427\U0001044E'.istitle())
344 # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
345 for ch in [u'\U00010429', u'\U0001044E', u'\U0001F40D', u'\U0001F46F']:
346 self.assertFalse(ch.istitle(), '{!r} is not title'.format(ch))
347
Walter Dörwald28256f22003-01-19 16:59:20 +0000348 def test_isspace(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000349 string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
350 self.checkequalnofix(True, u'\u2000', 'isspace')
351 self.checkequalnofix(True, u'\u200a', 'isspace')
352 self.checkequalnofix(False, u'\u2014', 'isspace')
Walter Dörwald28256f22003-01-19 16:59:20 +0000353
Ezio Melotti12682b12011-08-22 23:46:30 +0300354 @requires_wide_build
355 def test_isspace_non_bmp(self):
356 # apparently there are no non-BMP spaces chars in Unicode 6
357 for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
358 u'\U0001F40D', u'\U0001F46F']:
359 self.assertFalse(ch.isspace(), '{!r} is not space.'.format(ch))
360
361 @requires_wide_build
362 def test_isalnum_non_bmp(self):
363 for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
364 u'\U0001D7F6', u'\U000104A0', u'\U000104A0', u'\U0001F107']:
365 self.assertTrue(ch.isalnum(), '{!r} is alnum.'.format(ch))
366
Walter Dörwald28256f22003-01-19 16:59:20 +0000367 def test_isalpha(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000368 string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
369 self.checkequalnofix(True, u'\u1FFc', 'isalpha')
Walter Dörwald28256f22003-01-19 16:59:20 +0000370
Ezio Melotti12682b12011-08-22 23:46:30 +0300371 @requires_wide_build
372 def test_isalpha_non_bmp(self):
373 # non-BMP, cased
374 self.assertTrue(u'\U00010401'.isalpha())
375 self.assertTrue(u'\U00010427'.isalpha())
376 self.assertTrue(u'\U00010429'.isalpha())
377 self.assertTrue(u'\U0001044E'.isalpha())
378 # non-BMP, non-cased
379 self.assertFalse(u'\U0001F40D'.isalpha())
380 self.assertFalse(u'\U0001F46F'.isalpha())
381
Walter Dörwald28256f22003-01-19 16:59:20 +0000382 def test_isdecimal(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000383 self.checkequalnofix(False, u'', 'isdecimal')
384 self.checkequalnofix(False, u'a', 'isdecimal')
385 self.checkequalnofix(True, u'0', 'isdecimal')
386 self.checkequalnofix(False, u'\u2460', 'isdecimal') # CIRCLED DIGIT ONE
387 self.checkequalnofix(False, u'\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
388 self.checkequalnofix(True, u'\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
389 self.checkequalnofix(True, u'0123456789', 'isdecimal')
390 self.checkequalnofix(False, u'0123456789a', 'isdecimal')
Walter Dörwald28256f22003-01-19 16:59:20 +0000391
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000392 self.checkraises(TypeError, 'abc', 'isdecimal', 42)
Walter Dörwald28256f22003-01-19 16:59:20 +0000393
Ezio Melotti12682b12011-08-22 23:46:30 +0300394 @requires_wide_build
395 def test_isdecimal_non_bmp(self):
396 for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
397 u'\U0001F40D', u'\U0001F46F', u'\U00011065', u'\U0001F107']:
398 self.assertFalse(ch.isdecimal(), '{!r} is not decimal.'.format(ch))
399 for ch in [u'\U0001D7F6', u'\U000104A0', u'\U000104A0']:
400 self.assertTrue(ch.isdecimal(), '{!r} is decimal.'.format(ch))
401
Walter Dörwald28256f22003-01-19 16:59:20 +0000402 def test_isdigit(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000403 string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
404 self.checkequalnofix(True, u'\u2460', 'isdigit')
405 self.checkequalnofix(False, u'\xbc', 'isdigit')
406 self.checkequalnofix(True, u'\u0660', 'isdigit')
Walter Dörwald28256f22003-01-19 16:59:20 +0000407
Ezio Melotti12682b12011-08-22 23:46:30 +0300408 @requires_wide_build
409 def test_isdigit_non_bmp(self):
410 for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
411 u'\U0001F40D', u'\U0001F46F', u'\U00011065']:
412 self.assertFalse(ch.isdigit(), '{!r} is not a digit.'.format(ch))
413 for ch in [u'\U0001D7F6', u'\U000104A0', u'\U000104A0', u'\U0001F107']:
414 self.assertTrue(ch.isdigit(), '{!r} is a digit.'.format(ch))
415
Walter Dörwald28256f22003-01-19 16:59:20 +0000416 def test_isnumeric(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000417 self.checkequalnofix(False, u'', 'isnumeric')
418 self.checkequalnofix(False, u'a', 'isnumeric')
419 self.checkequalnofix(True, u'0', 'isnumeric')
420 self.checkequalnofix(True, u'\u2460', 'isnumeric')
421 self.checkequalnofix(True, u'\xbc', 'isnumeric')
422 self.checkequalnofix(True, u'\u0660', 'isnumeric')
423 self.checkequalnofix(True, u'0123456789', 'isnumeric')
424 self.checkequalnofix(False, u'0123456789a', 'isnumeric')
Walter Dörwald28256f22003-01-19 16:59:20 +0000425
426 self.assertRaises(TypeError, u"abc".isnumeric, 42)
427
Ezio Melotti12682b12011-08-22 23:46:30 +0300428 @requires_wide_build
429 def test_isnumeric_non_bmp(self):
430 for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
431 u'\U0001F40D', u'\U0001F46F']:
432 self.assertFalse(ch.isnumeric(), '{!r} is not numeric.'.format(ch))
433 for ch in [u'\U00010107', u'\U0001D7F6', u'\U00023b1b',
434 u'\U000104A0', u'\U0001F107']:
435 self.assertTrue(ch.isnumeric(), '{!r} is numeric.'.format(ch))
436
437 @requires_wide_build
438 def test_surrogates(self):
439 # this test actually passes on narrow too, but it's just by accident.
440 # Surrogates are seen as non-cased chars, so u'X\uD800X' is as
441 # uppercase as 'X X'
442 for s in (u'a\uD800b\uDFFF', u'a\uDFFFb\uD800',
443 u'a\uD800b\uDFFFa', u'a\uDFFFb\uD800a'):
444 self.assertTrue(s.islower())
445 self.assertFalse(s.isupper())
446 self.assertFalse(s.istitle())
447 for s in (u'A\uD800B\uDFFF', u'A\uDFFFB\uD800',
448 u'A\uD800B\uDFFFA', u'A\uDFFFB\uD800A'):
449 self.assertFalse(s.islower())
450 self.assertTrue(s.isupper())
451 self.assertTrue(s.istitle())
452
453 for meth_name in ('islower', 'isupper', 'istitle'):
454 meth = getattr(unicode, meth_name)
455 for s in (u'\uD800', u'\uDFFF', u'\uD800\uD800', u'\uDFFF\uDFFF'):
456 self.assertFalse(meth(s), '%r.%s() is False' % (s, meth_name))
457
458 for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
459 'isdecimal', 'isnumeric'):
460 meth = getattr(unicode, meth_name)
461 for s in (u'\uD800', u'\uDFFF', u'\uD800\uD800', u'\uDFFF\uDFFF',
462 u'a\uD800b\uDFFF', u'a\uDFFFb\uD800',
463 u'a\uD800b\uDFFFa', u'a\uDFFFb\uD800a'):
464 self.assertFalse(meth(s), '%r.%s() is False' % (s, meth_name))
465
466
467 @requires_wide_build
468 def test_lower(self):
469 string_tests.CommonTest.test_lower(self)
470 self.assertEqual(u'\U00010427'.lower(), u'\U0001044F')
471 self.assertEqual(u'\U00010427\U00010427'.lower(),
472 u'\U0001044F\U0001044F')
473 self.assertEqual(u'\U00010427\U0001044F'.lower(),
474 u'\U0001044F\U0001044F')
475 self.assertEqual(u'X\U00010427x\U0001044F'.lower(),
476 u'x\U0001044Fx\U0001044F')
477
478 @requires_wide_build
479 def test_upper(self):
480 string_tests.CommonTest.test_upper(self)
481 self.assertEqual(u'\U0001044F'.upper(), u'\U00010427')
482 self.assertEqual(u'\U0001044F\U0001044F'.upper(),
483 u'\U00010427\U00010427')
484 self.assertEqual(u'\U00010427\U0001044F'.upper(),
485 u'\U00010427\U00010427')
486 self.assertEqual(u'X\U00010427x\U0001044F'.upper(),
487 u'X\U00010427X\U00010427')
488
489 @requires_wide_build
Berker Peksagdfdae022014-11-24 23:57:00 +0200490 def test_capitalize_wide_build(self):
Ezio Melotti12682b12011-08-22 23:46:30 +0300491 string_tests.CommonTest.test_capitalize(self)
492 self.assertEqual(u'\U0001044F'.capitalize(), u'\U00010427')
493 self.assertEqual(u'\U0001044F\U0001044F'.capitalize(),
494 u'\U00010427\U0001044F')
495 self.assertEqual(u'\U00010427\U0001044F'.capitalize(),
496 u'\U00010427\U0001044F')
497 self.assertEqual(u'\U0001044F\U00010427'.capitalize(),
498 u'\U00010427\U0001044F')
499 self.assertEqual(u'X\U00010427x\U0001044F'.capitalize(),
500 u'X\U0001044Fx\U0001044F')
501
502 @requires_wide_build
503 def test_title(self):
504 string_tests.MixinStrUnicodeUserStringTest.test_title(self)
505 self.assertEqual(u'\U0001044F'.title(), u'\U00010427')
506 self.assertEqual(u'\U0001044F\U0001044F'.title(),
507 u'\U00010427\U0001044F')
508 self.assertEqual(u'\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
509 u'\U00010427\U0001044F \U00010427\U0001044F')
510 self.assertEqual(u'\U00010427\U0001044F \U00010427\U0001044F'.title(),
511 u'\U00010427\U0001044F \U00010427\U0001044F')
512 self.assertEqual(u'\U0001044F\U00010427 \U0001044F\U00010427'.title(),
513 u'\U00010427\U0001044F \U00010427\U0001044F')
514 self.assertEqual(u'X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
515 u'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
516
517 @requires_wide_build
518 def test_swapcase(self):
519 string_tests.CommonTest.test_swapcase(self)
520 self.assertEqual(u'\U0001044F'.swapcase(), u'\U00010427')
521 self.assertEqual(u'\U00010427'.swapcase(), u'\U0001044F')
522 self.assertEqual(u'\U0001044F\U0001044F'.swapcase(),
523 u'\U00010427\U00010427')
524 self.assertEqual(u'\U00010427\U0001044F'.swapcase(),
525 u'\U0001044F\U00010427')
526 self.assertEqual(u'\U0001044F\U00010427'.swapcase(),
527 u'\U00010427\U0001044F')
528 self.assertEqual(u'X\U00010427x\U0001044F'.swapcase(),
529 u'x\U0001044FX\U00010427')
530
Walter Dörwald28256f22003-01-19 16:59:20 +0000531 def test_contains(self):
532 # Testing Unicode contains method
Ezio Melottiaa980582010-01-23 23:04:36 +0000533 self.assertIn('a', u'abdb')
534 self.assertIn('a', u'bdab')
535 self.assertIn('a', u'bdaba')
536 self.assertIn('a', u'bdba')
537 self.assertIn('a', u'bdba')
538 self.assertIn(u'a', u'bdba')
539 self.assertNotIn(u'a', u'bdb')
540 self.assertNotIn(u'a', 'bdb')
541 self.assertIn(u'a', 'bdba')
542 self.assertIn(u'a', ('a',1,None))
543 self.assertIn(u'a', (1,None,'a'))
544 self.assertIn(u'a', (1,None,u'a'))
545 self.assertIn('a', ('a',1,None))
546 self.assertIn('a', (1,None,'a'))
547 self.assertIn('a', (1,None,u'a'))
548 self.assertNotIn('a', ('x',1,u'y'))
549 self.assertNotIn('a', ('x',1,None))
550 self.assertNotIn(u'abcd', u'abcxxxx')
551 self.assertIn(u'ab', u'abcd')
552 self.assertIn('ab', u'abc')
553 self.assertIn(u'ab', 'abc')
554 self.assertIn(u'ab', (1,None,u'ab'))
555 self.assertIn(u'', u'abc')
556 self.assertIn('', u'abc')
Walter Dörwald28256f22003-01-19 16:59:20 +0000557
558 # If the following fails either
559 # the contains operator does not propagate UnicodeErrors or
560 # someone has changed the default encoding
R. David Murray0a0a1a82009-12-14 16:28:26 +0000561 self.assertRaises(UnicodeDecodeError, 'g\xe2teau'.__contains__, u'\xe2')
562 self.assertRaises(UnicodeDecodeError, u'g\xe2teau'.__contains__, '\xe2')
Walter Dörwald28256f22003-01-19 16:59:20 +0000563
Ezio Melottiaa980582010-01-23 23:04:36 +0000564 self.assertIn(u'', '')
565 self.assertIn('', u'')
566 self.assertIn(u'', u'')
567 self.assertIn(u'', 'abc')
568 self.assertIn('', u'abc')
569 self.assertIn(u'', u'abc')
570 self.assertNotIn(u'\0', 'abc')
571 self.assertNotIn('\0', u'abc')
572 self.assertNotIn(u'\0', u'abc')
573 self.assertIn(u'\0', '\0abc')
574 self.assertIn('\0', u'\0abc')
575 self.assertIn(u'\0', u'\0abc')
576 self.assertIn(u'\0', 'abc\0')
577 self.assertIn('\0', u'abc\0')
578 self.assertIn(u'\0', u'abc\0')
579 self.assertIn(u'a', '\0abc')
580 self.assertIn('a', u'\0abc')
581 self.assertIn(u'a', u'\0abc')
582 self.assertIn(u'asdf', 'asdf')
583 self.assertIn('asdf', u'asdf')
584 self.assertIn(u'asdf', u'asdf')
585 self.assertNotIn(u'asdf', 'asd')
586 self.assertNotIn('asdf', u'asd')
587 self.assertNotIn(u'asdf', u'asd')
588 self.assertNotIn(u'asdf', '')
589 self.assertNotIn('asdf', u'')
590 self.assertNotIn(u'asdf', u'')
Walter Dörwald28256f22003-01-19 16:59:20 +0000591
592 self.assertRaises(TypeError, u"abc".__contains__)
R. David Murray0a0a1a82009-12-14 16:28:26 +0000593 self.assertRaises(TypeError, u"abc".__contains__, object())
Walter Dörwald28256f22003-01-19 16:59:20 +0000594
595 def test_formatting(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000596 string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
Walter Dörwald28256f22003-01-19 16:59:20 +0000597 # Testing Unicode formatting strings...
598 self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
599 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000, 3.00')
600 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000, 3.00')
601 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000, 3.50')
602 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000, 3.57')
603 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57')
Walter Dörwald28256f22003-01-19 16:59:20 +0000604 if not sys.platform.startswith('java'):
605 self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
606 self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000607 self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')
Walter Dörwald56fbcb52003-03-31 18:18:41 +0000608
Walter Dörwald43440a62003-03-31 18:07:50 +0000609 self.assertEqual(u'%c' % 0x1234, u'\u1234')
Walter Dörwald44f527f2003-04-02 16:37:24 +0000610 self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))
Stefan Krah0b9201f2010-07-19 18:06:46 +0000611 self.assertRaises(ValueError, u"%.1\u1032f".__mod__, (1.0/3))
Walter Dörwald28256f22003-01-19 16:59:20 +0000612
Victor Stinnerf20f9c22010-02-23 23:16:07 +0000613 for num in range(0x00,0x80):
614 char = chr(num)
Florent Xiclunac0c0b142010-09-13 08:53:00 +0000615 self.assertEqual(u"%c" % char, unicode(char))
616 self.assertEqual(u"%c" % num, unicode(char))
617 self.assertTrue(char == u"%c" % char)
618 self.assertTrue(char == u"%c" % num)
Victor Stinnerf20f9c22010-02-23 23:16:07 +0000619 # Issue 7649
620 for num in range(0x80,0x100):
621 uchar = unichr(num)
622 self.assertEqual(uchar, u"%c" % num) # works only with ints
623 self.assertEqual(uchar, u"%c" % uchar) # and unicode chars
624 # the implicit decoding should fail for non-ascii chars
625 self.assertRaises(UnicodeDecodeError, u"%c".__mod__, chr(num))
626 self.assertRaises(UnicodeDecodeError, u"%s".__mod__, chr(num))
627
Walter Dörwald28256f22003-01-19 16:59:20 +0000628 # formatting jobs delegated from the string implementation:
629 self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
630 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
631 self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...')
632 self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...')
633 self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123}, u'...abc...')
634 self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...')
635 self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...')
636 self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...')
637 self.assertEqual('...%s...' % u"abc", u'...abc...')
638 self.assertEqual('%*s' % (5,u'abc',), u' abc')
639 self.assertEqual('%*s' % (-5,u'abc',), u'abc ')
640 self.assertEqual('%*.*s' % (5,2,u'abc',), u' ab')
641 self.assertEqual('%*.*s' % (5,3,u'abc',), u' abc')
642 self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10 abc')
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000643 self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103 abc')
Walter Dörwald43440a62003-03-31 18:07:50 +0000644 self.assertEqual('%c' % u'a', u'a')
Neil Schemenauercf52c072005-08-12 17:34:58 +0000645 class Wrapper:
646 def __str__(self):
647 return u'\u1234'
648 self.assertEqual('%s' % Wrapper(), u'\u1234')
Tim Peters4511a712006-05-03 04:46:14 +0000649
Mark Dickinson75d36002012-10-28 10:00:46 +0000650 def test_formatting_huge_precision(self):
Serhiy Storchaka76249ea2014-02-07 10:06:05 +0200651 format_string = u"%.{}f".format(sys.maxsize + 1)
652 with self.assertRaises(ValueError):
653 result = format_string % 2.34
654
655 @test_support.cpython_only
656 def test_formatting_huge_precision_c_limits(self):
Mark Dickinson75d36002012-10-28 10:00:46 +0000657 from _testcapi import INT_MAX
658 format_string = u"%.{}f".format(INT_MAX + 1)
659 with self.assertRaises(ValueError):
660 result = format_string % 2.34
661
662 def test_formatting_huge_width(self):
663 format_string = u"%{}f".format(sys.maxsize + 1)
664 with self.assertRaises(ValueError):
665 result = format_string % 2.34
666
Ezio Melottie3685f62011-04-26 05:12:51 +0300667 def test_startswith_endswith_errors(self):
668 for meth in (u'foo'.startswith, u'foo'.endswith):
669 with self.assertRaises(UnicodeDecodeError):
670 meth('\xff')
671 with self.assertRaises(TypeError) as cm:
672 meth(['f'])
673 exc = str(cm.exception)
674 self.assertIn('unicode', exc)
675 self.assertIn('str', exc)
676 self.assertIn('tuple', exc)
677
Georg Brandlde9b6242006-04-30 11:13:56 +0000678 @test_support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
Georg Brandlda6b1072006-01-20 17:48:54 +0000679 def test_format_float(self):
Georg Brandlde9b6242006-04-30 11:13:56 +0000680 # should not format with a comma, but always with C locale
681 self.assertEqual(u'1.0', u'%.1f' % 1.0)
Georg Brandlda6b1072006-01-20 17:48:54 +0000682
Walter Dörwald28256f22003-01-19 16:59:20 +0000683 def test_constructor(self):
684 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
685
686 self.assertEqual(
687 unicode(u'unicode remains unicode'),
688 u'unicode remains unicode'
689 )
690
Walter Dörwald28256f22003-01-19 16:59:20 +0000691 self.assertEqual(
692 unicode(UnicodeSubclass('unicode subclass becomes unicode')),
693 u'unicode subclass becomes unicode'
694 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000695
Walter Dörwald28256f22003-01-19 16:59:20 +0000696 self.assertEqual(
697 unicode('strings are converted to unicode'),
698 u'strings are converted to unicode'
699 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000700
Walter Dörwald28256f22003-01-19 16:59:20 +0000701 class UnicodeCompat:
702 def __init__(self, x):
703 self.x = x
704 def __unicode__(self):
705 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000706
Walter Dörwald28256f22003-01-19 16:59:20 +0000707 self.assertEqual(
708 unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
709 u'__unicode__ compatible objects are recognized')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000710
Walter Dörwald28256f22003-01-19 16:59:20 +0000711 class StringCompat:
712 def __init__(self, x):
713 self.x = x
714 def __str__(self):
715 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000716
Walter Dörwald28256f22003-01-19 16:59:20 +0000717 self.assertEqual(
718 unicode(StringCompat('__str__ compatible objects are recognized')),
719 u'__str__ compatible objects are recognized'
720 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000721
Walter Dörwald28256f22003-01-19 16:59:20 +0000722 # unicode(obj) is compatible to str():
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000723
Walter Dörwald28256f22003-01-19 16:59:20 +0000724 o = StringCompat('unicode(obj) is compatible to str()')
725 self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
726 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000727
Marc-André Lemburgd25c6502004-07-23 16:13:25 +0000728 # %-formatting and .__unicode__()
729 self.assertEqual(u'%s' %
730 UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"),
731 u"u'%s' % obj uses obj.__unicode__()")
732 self.assertEqual(u'%s' %
733 UnicodeCompat(u"u'%s' % obj falls back to obj.__str__()"),
734 u"u'%s' % obj falls back to obj.__str__()")
735
Walter Dörwald28256f22003-01-19 16:59:20 +0000736 for obj in (123, 123.45, 123L):
737 self.assertEqual(unicode(obj), unicode(str(obj)))
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000738
Walter Dörwald28256f22003-01-19 16:59:20 +0000739 # unicode(obj, encoding, error) tests (this maps to
740 # PyUnicode_FromEncodedObject() at C level)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000741
Walter Dörwald28256f22003-01-19 16:59:20 +0000742 if not sys.platform.startswith('java'):
743 self.assertRaises(
744 TypeError,
745 unicode,
746 u'decoding unicode is not supported',
747 'utf-8',
748 'strict'
749 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000750
Walter Dörwald28256f22003-01-19 16:59:20 +0000751 self.assertEqual(
752 unicode('strings are decoded to unicode', 'utf-8', 'strict'),
753 u'strings are decoded to unicode'
754 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000755
Walter Dörwald28256f22003-01-19 16:59:20 +0000756 if not sys.platform.startswith('java'):
Florent Xicluna6de9e932010-03-07 12:18:33 +0000757 with test_support.check_py3k_warnings():
Antoine Pitrou5b7139a2010-01-02 21:12:58 +0000758 buf = buffer('character buffers are decoded to unicode')
Walter Dörwald28256f22003-01-19 16:59:20 +0000759 self.assertEqual(
760 unicode(
Antoine Pitrou5b7139a2010-01-02 21:12:58 +0000761 buf,
Walter Dörwald28256f22003-01-19 16:59:20 +0000762 'utf-8',
763 'strict'
764 ),
765 u'character buffers are decoded to unicode'
766 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000767
Walter Dörwald28256f22003-01-19 16:59:20 +0000768 self.assertRaises(TypeError, unicode, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000769
Walter Dörwald28256f22003-01-19 16:59:20 +0000770 def test_codecs_utf7(self):
771 utfTests = [
772 (u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
773 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
774 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
775 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
776 (u'+', '+-'),
777 (u'+-', '+--'),
778 (u'+?', '+-?'),
779 (u'\?', '+AFw?'),
780 (u'+?', '+-?'),
781 (ur'\\?', '+AFwAXA?'),
782 (ur'\\\?', '+AFwAXABc?'),
Antoine Pitrou653dece2009-05-04 18:32:32 +0000783 (ur'++--', '+-+---'),
784 (u'\U000abcde', '+2m/c3g-'), # surrogate pairs
785 (u'/', '/'),
Walter Dörwald28256f22003-01-19 16:59:20 +0000786 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000787
Walter Dörwald28256f22003-01-19 16:59:20 +0000788 for (x, y) in utfTests:
789 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000790
Antoine Pitrou30402542011-11-15 01:49:40 +0100791 # Unpaired surrogates are passed through
792 self.assertEqual(u'\uD801'.encode('utf-7'), '+2AE-')
793 self.assertEqual(u'\uD801x'.encode('utf-7'), '+2AE-x')
794 self.assertEqual(u'\uDC01'.encode('utf-7'), '+3AE-')
795 self.assertEqual(u'\uDC01x'.encode('utf-7'), '+3AE-x')
796 self.assertEqual('+2AE-'.decode('utf-7'), u'\uD801')
797 self.assertEqual('+2AE-x'.decode('utf-7'), u'\uD801x')
798 self.assertEqual('+3AE-'.decode('utf-7'), u'\uDC01')
799 self.assertEqual('+3AE-x'.decode('utf-7'), u'\uDC01x')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000800
Antoine Pitrou30402542011-11-15 01:49:40 +0100801 self.assertEqual(u'\uD801\U000abcde'.encode('utf-7'), '+2AHab9ze-')
802 self.assertEqual('+2AHab9ze-'.decode('utf-7'), u'\uD801\U000abcde')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000803
Antoine Pitrou653dece2009-05-04 18:32:32 +0000804 # Direct encoded characters
805 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
806 # Optional direct characters
807 set_o = '!"#$%&*;<=>@[]^_`{|}'
808 for c in set_d:
809 self.assertEqual(c.encode('utf7'), c.encode('ascii'))
Florent Xiclunac0c0b142010-09-13 08:53:00 +0000810 self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c))
811 self.assertTrue(c == c.encode('ascii').decode('utf7'))
Antoine Pitrou653dece2009-05-04 18:32:32 +0000812 for c in set_o:
Florent Xiclunac0c0b142010-09-13 08:53:00 +0000813 self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c))
814 self.assertTrue(c == c.encode('ascii').decode('utf7'))
Antoine Pitrou4982d5d2008-07-25 17:45:59 +0000815
Walter Dörwald28256f22003-01-19 16:59:20 +0000816 def test_codecs_utf8(self):
817 self.assertEqual(u''.encode('utf-8'), '')
818 self.assertEqual(u'\u20ac'.encode('utf-8'), '\xe2\x82\xac')
819 self.assertEqual(u'\ud800\udc02'.encode('utf-8'), '\xf0\x90\x80\x82')
820 self.assertEqual(u'\ud84d\udc56'.encode('utf-8'), '\xf0\xa3\x91\x96')
821 self.assertEqual(u'\ud800'.encode('utf-8'), '\xed\xa0\x80')
822 self.assertEqual(u'\udc00'.encode('utf-8'), '\xed\xb0\x80')
823 self.assertEqual(
824 (u'\ud800\udc02'*1000).encode('utf-8'),
825 '\xf0\x90\x80\x82'*1000
826 )
827 self.assertEqual(
828 u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
829 u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
830 u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
831 u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
832 u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
833 u' Nunstuck git und'.encode('utf-8'),
834 '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
835 '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
836 '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
837 '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
838 '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
839 '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
840 '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
841 '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
842 '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
843 '\xe3\x80\x8cWenn ist das Nunstuck git und'
844 )
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000845
Walter Dörwald28256f22003-01-19 16:59:20 +0000846 # UTF-8 specific decoding tests
Florent Xicluna9b90cd12010-09-13 07:46:37 +0000847 self.assertEqual(unicode('\xf0\xa3\x91\x96', 'utf-8'), u'\U00023456')
848 self.assertEqual(unicode('\xf0\x90\x80\x82', 'utf-8'), u'\U00010002')
849 self.assertEqual(unicode('\xe2\x82\xac', 'utf-8'), u'\u20ac')
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000850
Walter Dörwald28256f22003-01-19 16:59:20 +0000851 # Other possible utf-8 test cases:
852 # * strict decoding testing for all of the
853 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000854
Ezio Melottie57e50c2010-06-05 17:51:07 +0000855 def test_utf8_decode_valid_sequences(self):
856 sequences = [
857 # single byte
858 ('\x00', u'\x00'), ('a', u'a'), ('\x7f', u'\x7f'),
859 # 2 bytes
860 ('\xc2\x80', u'\x80'), ('\xdf\xbf', u'\u07ff'),
861 # 3 bytes
862 ('\xe0\xa0\x80', u'\u0800'), ('\xed\x9f\xbf', u'\ud7ff'),
863 ('\xee\x80\x80', u'\uE000'), ('\xef\xbf\xbf', u'\uffff'),
864 # 4 bytes
865 ('\xF0\x90\x80\x80', u'\U00010000'),
866 ('\xf4\x8f\xbf\xbf', u'\U0010FFFF')
867 ]
868 for seq, res in sequences:
869 self.assertEqual(seq.decode('utf-8'), res)
870
871 for ch in map(unichr, range(0, sys.maxunicode)):
872 self.assertEqual(ch, ch.encode('utf-8').decode('utf-8'))
873
874 def test_utf8_decode_invalid_sequences(self):
875 # continuation bytes in a sequence of 2, 3, or 4 bytes
876 continuation_bytes = map(chr, range(0x80, 0xC0))
Serhiy Storchakae8c9e142015-01-18 11:42:50 +0200877 # start bytes of a 2-byte sequence equivalent to code points < 0x7F
Ezio Melottie57e50c2010-06-05 17:51:07 +0000878 invalid_2B_seq_start_bytes = map(chr, range(0xC0, 0xC2))
Serhiy Storchakae8c9e142015-01-18 11:42:50 +0200879 # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
Ezio Melottie57e50c2010-06-05 17:51:07 +0000880 invalid_4B_seq_start_bytes = map(chr, range(0xF5, 0xF8))
881 invalid_start_bytes = (
882 continuation_bytes + invalid_2B_seq_start_bytes +
883 invalid_4B_seq_start_bytes + map(chr, range(0xF7, 0x100))
884 )
885
886 for byte in invalid_start_bytes:
887 self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
888
889 for sb in invalid_2B_seq_start_bytes:
890 for cb in continuation_bytes:
891 self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
892
893 for sb in invalid_4B_seq_start_bytes:
894 for cb1 in continuation_bytes[:3]:
895 for cb3 in continuation_bytes[:3]:
896 self.assertRaises(UnicodeDecodeError,
897 (sb+cb1+'\x80'+cb3).decode, 'utf-8')
898
899 for cb in map(chr, range(0x80, 0xA0)):
900 self.assertRaises(UnicodeDecodeError,
901 ('\xE0'+cb+'\x80').decode, 'utf-8')
902 self.assertRaises(UnicodeDecodeError,
903 ('\xE0'+cb+'\xBF').decode, 'utf-8')
904 # XXX: surrogates shouldn't be valid UTF-8!
905 # see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
906 # (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
907 #for cb in map(chr, range(0xA0, 0xC0)):
Ezio Melottie57e50c2010-06-05 17:51:07 +0000908 #self.assertRaises(UnicodeDecodeError,
909 #('\xED'+cb+'\x80').decode, 'utf-8')
910 #self.assertRaises(UnicodeDecodeError,
911 #('\xED'+cb+'\xBF').decode, 'utf-8')
Ezio Melotti370d85c2011-02-28 01:42:29 +0000912 # but since they are valid on Python 2 add a test for that:
913 for cb, surrogate in zip(map(chr, range(0xA0, 0xC0)),
914 map(unichr, range(0xd800, 0xe000, 64))):
915 encoded = '\xED'+cb+'\x80'
916 self.assertEqual(encoded.decode('utf-8'), surrogate)
917 self.assertEqual(surrogate.encode('utf-8'), encoded)
918
Ezio Melottie57e50c2010-06-05 17:51:07 +0000919 for cb in map(chr, range(0x80, 0x90)):
920 self.assertRaises(UnicodeDecodeError,
921 ('\xF0'+cb+'\x80\x80').decode, 'utf-8')
922 self.assertRaises(UnicodeDecodeError,
923 ('\xF0'+cb+'\xBF\xBF').decode, 'utf-8')
924 for cb in map(chr, range(0x90, 0xC0)):
925 self.assertRaises(UnicodeDecodeError,
926 ('\xF4'+cb+'\x80\x80').decode, 'utf-8')
927 self.assertRaises(UnicodeDecodeError,
928 ('\xF4'+cb+'\xBF\xBF').decode, 'utf-8')
929
    def test_issue8271(self):
        # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
        # only the start byte and the continuation byte(s) are now considered
        # invalid, instead of the number of bytes specified by the start byte.
        # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
        # table 3-8, Row 2) for more information about the algorithm used.
        FFFD = u'\ufffd'
        # Each entry pairs an invalid byte string with the text expected
        # from decoding it with the 'replace' error handler.
        sequences = [
            # invalid start bytes
            ('\x80', FFFD), # continuation byte
            ('\x80\x80', FFFD*2), # 2 continuation bytes
            ('\xc0', FFFD),
            ('\xc0\xc0', FFFD*2),
            ('\xc1', FFFD),
            ('\xc1\xc0', FFFD*2),
            ('\xc0\xc1', FFFD*2),
            # with start byte of a 2-byte sequence
            ('\xc2', FFFD), # only the start byte
            ('\xc2\xc2', FFFD*2), # 2 start bytes
            ('\xc2\xc2\xc2', FFFD*3), # 3 start bytes
            ('\xc2\x41', FFFD+'A'), # invalid continuation byte
            # with start byte of a 3-byte sequence
            ('\xe1', FFFD), # only the start byte
            ('\xe1\xe1', FFFD*2), # 2 start bytes
            ('\xe1\xe1\xe1', FFFD*3), # 3 start bytes
            ('\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
            ('\xe1\x80', FFFD), # only 1 continuation byte
            ('\xe1\x41', FFFD+'A'), # invalid continuation byte
            ('\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
            ('\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
            ('\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
            ('\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
            ('\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
            # with start byte of a 4-byte sequence
            ('\xf1', FFFD), # only the start byte
            ('\xf1\xf1', FFFD*2), # 2 start bytes
            ('\xf1\xf1\xf1', FFFD*3), # 3 start bytes
            ('\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
            ('\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
            ('\xf1\x80', FFFD), # only 1 continuation byte
            ('\xf1\x80\x80', FFFD), # only 2 continuation bytes
            ('\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
            ('\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 2 invalid
            ('\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
            ('\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cb and 1 valid
            ('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 valid
            ('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 valid
            ('\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 2 invalid cb and 1 valid
            ('\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
            ('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
            ('\xf1\xf1\x80\x41', FFFD*2+'A'),
            ('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
            # with invalid start byte of a 4-byte sequence (rfc2279)
            ('\xf5', FFFD), # only the start byte
            ('\xf5\xf5', FFFD*2), # 2 start bytes
            ('\xf5\x80', FFFD*2), # only 1 continuation byte
            ('\xf5\x80\x80', FFFD*3), # only 2 continuation bytes
            ('\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
            ('\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
            ('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
            ('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
            # with invalid start byte of a 5-byte sequence (rfc2279)
            ('\xf8', FFFD), # only the start byte
            ('\xf8\xf8', FFFD*2), # 2 start bytes
            ('\xf8\x80', FFFD*2), # only one continuation byte
            ('\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
            ('\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
            # with invalid start byte of a 6-byte sequence (rfc2279)
            ('\xfc', FFFD), # only the start byte
            ('\xfc\xfc', FFFD*2), # 2 start bytes
            ('\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
            ('\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
            # invalid start byte
            ('\xfe', FFFD),
            ('\xfe\x80\x80', FFFD*3),
            # other sequences
            ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
            ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
            ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
            ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
             u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
        ]
        # n is the table index; the assertions below do not use it.
        for n, (seq, res) in enumerate(sequences):
            # Every sequence is invalid, so strict decoding must fail.
            self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
            self.assertEqual(seq.decode('utf-8', 'replace'), res)
            # A valid byte appended after the sequence must decode as usual.
            self.assertEqual((seq+'b').decode('utf-8', 'replace'), res+'b')
            # 'ignore' yields the 'replace' result without the U+FFFD marks.
            self.assertEqual(seq.decode('utf-8', 'ignore'),
                             res.replace(u'\uFFFD', ''))
1018
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00001019 def test_codecs_idna(self):
1020 # Test whether trailing dot is preserved
1021 self.assertEqual(u"www.python.org.".encode("idna"), "www.python.org.")
1022
Walter Dörwald28256f22003-01-19 16:59:20 +00001023 def test_codecs_errors(self):
1024 # Error handling (encoding)
1025 self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii')
1026 self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii','strict')
1027 self.assertEqual(u'Andr\202 x'.encode('ascii','ignore'), "Andr x")
1028 self.assertEqual(u'Andr\202 x'.encode('ascii','replace'), "Andr? x")
Benjamin Peterson332d7212009-09-18 21:14:55 +00001029 self.assertEqual(u'Andr\202 x'.encode('ascii', 'replace'),
1030 u'Andr\202 x'.encode('ascii', errors='replace'))
1031 self.assertEqual(u'Andr\202 x'.encode('ascii', 'ignore'),
1032 u'Andr\202 x'.encode(encoding='ascii', errors='ignore'))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001033
Walter Dörwald28256f22003-01-19 16:59:20 +00001034 # Error handling (decoding)
1035 self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii')
1036 self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict')
1037 self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x")
1038 self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x')
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001039 self.assertEqual(unicode('\202 x', 'ascii', 'replace'), u'\uFFFD x')
Serhiy Storchakae37003e2015-12-03 20:47:48 +02001040 with test_support.check_py3k_warnings():
1041 self.assertEqual(u'abcde'.decode('ascii', 'ignore'),
1042 u'abcde'.decode('ascii', errors='ignore'))
1043 with test_support.check_py3k_warnings():
1044 self.assertEqual(u'abcde'.decode('ascii', 'replace'),
1045 u'abcde'.decode(encoding='ascii', errors='replace'))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001046
Walter Dörwald28256f22003-01-19 16:59:20 +00001047 # Error handling (unknown character names)
1048 self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx")
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001049
Walter Dörwald28256f22003-01-19 16:59:20 +00001050 # Error handling (truncated escape sequence)
1051 self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +00001052
Walter Dörwald28256f22003-01-19 16:59:20 +00001053 self.assertRaises(TypeError, "hello".decode, "test.unicode1")
1054 self.assertRaises(TypeError, unicode, "hello", "test.unicode2")
1055 self.assertRaises(TypeError, u"hello".encode, "test.unicode1")
1056 self.assertRaises(TypeError, u"hello".encode, "test.unicode2")
1057 # executes PyUnicode_Encode()
1058 import imp
1059 self.assertRaises(
1060 ImportError,
1061 imp.find_module,
1062 "non-existing module",
1063 [u"non-existing dir"]
1064 )
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +00001065
Walter Dörwald28256f22003-01-19 16:59:20 +00001066 # Error handling (wrong arguments)
1067 self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001068
Walter Dörwald28256f22003-01-19 16:59:20 +00001069 # Error handling (PyUnicode_EncodeDecimal())
1070 self.assertRaises(UnicodeError, int, u"\u0200")
Guido van Rossum97064862000-04-10 13:52:48 +00001071
Walter Dörwald28256f22003-01-19 16:59:20 +00001072 def test_codecs(self):
1073 # Encoding
1074 self.assertEqual(u'hello'.encode('ascii'), 'hello')
1075 self.assertEqual(u'hello'.encode('utf-7'), 'hello')
1076 self.assertEqual(u'hello'.encode('utf-8'), 'hello')
1077 self.assertEqual(u'hello'.encode('utf8'), 'hello')
1078 self.assertEqual(u'hello'.encode('utf-16-le'), 'h\000e\000l\000l\000o\000')
1079 self.assertEqual(u'hello'.encode('utf-16-be'), '\000h\000e\000l\000l\000o')
1080 self.assertEqual(u'hello'.encode('latin-1'), 'hello')
Guido van Rossum97064862000-04-10 13:52:48 +00001081
Walter Dörwald28256f22003-01-19 16:59:20 +00001082 # Roundtrip safety for BMP (just the first 1024 chars)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001083 for c in xrange(1024):
1084 u = unichr(c)
1085 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
1086 'utf-16-be', 'raw_unicode_escape',
1087 'unicode_escape', 'unicode_internal'):
1088 self.assertEqual(unicode(u.encode(encoding),encoding), u)
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001089
Walter Dörwald28256f22003-01-19 16:59:20 +00001090 # Roundtrip safety for BMP (just the first 256 chars)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001091 for c in xrange(256):
1092 u = unichr(c)
1093 for encoding in ('latin-1',):
1094 self.assertEqual(unicode(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001095
Walter Dörwald28256f22003-01-19 16:59:20 +00001096 # Roundtrip safety for BMP (just the first 128 chars)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001097 for c in xrange(128):
1098 u = unichr(c)
1099 for encoding in ('ascii',):
1100 self.assertEqual(unicode(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001101
Walter Dörwald28256f22003-01-19 16:59:20 +00001102 # Roundtrip safety for non-BMP (just a few chars)
1103 u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
1104 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
1105 #'raw_unicode_escape',
1106 'unicode_escape', 'unicode_internal'):
1107 self.assertEqual(unicode(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001108
Walter Dörwald28256f22003-01-19 16:59:20 +00001109 # UTF-8 must be roundtrip safe for all UCS-2 code points
1110 # This excludes surrogates: in the full range, there would be
1111 # a surrogate pair (\udbff\udc00), which gets converted back
1112 # to a non-BMP character (\U0010fc00)
1113 u = u''.join(map(unichr, range(0,0xd800)+range(0xe000,0x10000)))
1114 for encoding in ('utf-8',):
1115 self.assertEqual(unicode(u.encode(encoding),encoding), u)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001116
Walter Dörwald28256f22003-01-19 16:59:20 +00001117 def test_codecs_charmap(self):
1118 # 0-127
1119 s = ''.join(map(chr, xrange(128)))
1120 for encoding in (
1121 'cp037', 'cp1026',
Georg Brandlf0757a22010-05-24 21:29:07 +00001122 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
1123 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Walter Dörwald28256f22003-01-19 16:59:20 +00001124 'cp863', 'cp865', 'cp866',
1125 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
1126 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
1127 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
1128 'mac_cyrillic', 'mac_latin2',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001129
Walter Dörwald28256f22003-01-19 16:59:20 +00001130 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
1131 'cp1256', 'cp1257', 'cp1258',
1132 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001133
Walter Dörwald28256f22003-01-19 16:59:20 +00001134 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
1135 'cp1006', 'iso8859_8',
Guido van Rossum9e896b32000-04-05 20:11:21 +00001136
Walter Dörwald28256f22003-01-19 16:59:20 +00001137 ### These have undefined mappings:
1138 #'cp424',
Guido van Rossum9e896b32000-04-05 20:11:21 +00001139
Walter Dörwald28256f22003-01-19 16:59:20 +00001140 ### These fail the round-trip:
1141 #'cp875'
Guido van Rossum9e896b32000-04-05 20:11:21 +00001142
Walter Dörwald28256f22003-01-19 16:59:20 +00001143 ):
1144 self.assertEqual(unicode(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001145
Walter Dörwald28256f22003-01-19 16:59:20 +00001146 # 128-255
1147 s = ''.join(map(chr, xrange(128, 256)))
1148 for encoding in (
1149 'cp037', 'cp1026',
Georg Brandlf0757a22010-05-24 21:29:07 +00001150 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
1151 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Walter Dörwald28256f22003-01-19 16:59:20 +00001152 'cp863', 'cp865', 'cp866',
1153 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
1154 'iso8859_2', 'iso8859_4', 'iso8859_5',
1155 'iso8859_9', 'koi8_r', 'latin_1',
1156 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +00001157
Walter Dörwald28256f22003-01-19 16:59:20 +00001158 ### These have undefined mappings:
1159 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
1160 #'cp1256', 'cp1257', 'cp1258',
1161 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
1162 #'iso8859_3', 'iso8859_6', 'iso8859_7',
1163 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +00001164
Walter Dörwald28256f22003-01-19 16:59:20 +00001165 ### These fail the round-trip:
1166 #'cp1006', 'cp875', 'iso8859_8',
Tim Peters2f228e72001-05-13 00:19:31 +00001167
Walter Dörwald28256f22003-01-19 16:59:20 +00001168 ):
1169 self.assertEqual(unicode(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001170
Walter Dörwald28256f22003-01-19 16:59:20 +00001171 def test_concatenation(self):
1172 self.assertEqual((u"abc" u"def"), u"abcdef")
1173 self.assertEqual(("abc" u"def"), u"abcdef")
1174 self.assertEqual((u"abc" "def"), u"abcdef")
1175 self.assertEqual((u"abc" u"def" "ghi"), u"abcdefghi")
1176 self.assertEqual(("abc" "def" u"ghi"), u"abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +00001177
Walter Dörwald28256f22003-01-19 16:59:20 +00001178 def test_printing(self):
1179 class BitBucket:
1180 def write(self, text):
1181 pass
Fred Drake004d5e62000-10-23 17:22:08 +00001182
Walter Dörwald28256f22003-01-19 16:59:20 +00001183 out = BitBucket()
1184 print >>out, u'abc'
1185 print >>out, u'abc', u'def'
1186 print >>out, u'abc', 'def'
1187 print >>out, 'abc', u'def'
1188 print >>out, u'abc\n'
1189 print >>out, u'abc\n',
1190 print >>out, u'abc\n',
1191 print >>out, u'def\n'
1192 print >>out, u'def\n'
Fred Drake004d5e62000-10-23 17:22:08 +00001193
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00001194 def test_ucs4(self):
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00001195 x = u'\U00100000'
1196 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
1197 self.assertEqual(x, y)
1198
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00001199 y = r'\U00100000'
1200 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1201 self.assertEqual(x, y)
1202 y = r'\U00010000'
1203 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1204 self.assertEqual(x, y)
1205
1206 try:
1207 '\U11111111'.decode("raw-unicode-escape")
1208 except UnicodeDecodeError as e:
1209 self.assertEqual(e.start, 0)
1210 self.assertEqual(e.end, 10)
1211 else:
1212 self.fail("Should have raised UnicodeDecodeError")
1213
def test_conversion(self):
    # Make sure __unicode__() works properly: unicode() is applied to
    # objects that customise conversion via __str__ and/or __unicode__.

    # Old-style class that only provides __str__.
    class Foo0:
        def __str__(self):
            return "foo"

    # Old-style class providing __unicode__.
    class Foo1:
        def __unicode__(self):
            return u"foo"

    # New-style class providing __unicode__.
    class Foo2(object):
        def __unicode__(self):
            return u"foo"

    # __unicode__ hands back a byte string instead of unicode.
    class Foo3(object):
        def __unicode__(self):
            return "foo"

    # str subclass whose __unicode__ ignores the stored value.
    class Foo4(str):
        def __unicode__(self):
            return "foo"

    # unicode subclass whose __unicode__ ignores the stored value.
    class Foo5(unicode):
        def __unicode__(self):
            return "foo"

    # str subclass defining both hooks with different results.
    class Foo6(str):
        def __str__(self):
            return "foos"

        def __unicode__(self):
            return u"foou"

    # unicode subclass defining both hooks with different results.
    class Foo7(unicode):
        def __str__(self):
            return "foos"

        def __unicode__(self):
            return u"foou"

    # unicode subclass that doubles its content on construction and
    # returns itself from __unicode__.
    class Foo8(unicode):
        def __new__(cls, content=""):
            return unicode.__new__(cls, 2*content)

        def __unicode__(self):
            return self

    # unicode subclass whose two hooks both return byte strings.
    class Foo9(unicode):
        def __str__(self):
            return "string"

        def __unicode__(self):
            return "not unicode"

    conversions = [
        (Foo0(), u"foo"),
        (Foo1(), u"foo"),
        (Foo2(), u"foo"),
        (Foo3(), u"foo"),
        (Foo4("bar"), u"foo"),
        (Foo5("bar"), u"foo"),
        (Foo6("bar"), u"foou"),
        (Foo7("bar"), u"foou"),
        (Foo8("foo"), u"foofoo"),
    ]
    for obj, expected in conversions:
        self.assertEqual(unicode(obj), expected)

    # The object returned by Foo8.__unicode__ keeps its exact type,
    # while an explicit subclass constructor yields that subclass.
    converted = unicode(Foo8("foo"))
    self.assertIs(type(converted), Foo8)
    self.assertEqual(UnicodeSubclass(Foo8("foo")), u"foofoo")
    rewrapped = UnicodeSubclass(Foo8("foo"))
    self.assertIs(type(rewrapped), UnicodeSubclass)

    self.assertEqual(str(Foo9("foo")), "string")
    self.assertEqual(unicode(Foo9("foo")), u"not unicode")
1279
def test_unicode_repr(self):
    # repr() must produce the str '\\n' whether the old-style class's
    # __repr__ returns a str or a unicode object.
    class StrRepr:
        def __repr__(self):
            return '\\n'

    class UnicodeRepr:
        def __repr__(self):
            return u'\\n'

    for cls in (StrRepr, UnicodeRepr):
        self.assertEqual(repr(cls()), '\\n')
1291
# This test only affects 32-bit platforms because expandtabs can only take
# an int as the max value, not a 64-bit C long.  If expandtabs is changed
# to take a 64-bit long, this test should apply to all platforms.
@unittest.skipIf(sys.maxint > (1 << 32) or struct.calcsize('P') != 4,
                 'only applies to 32-bit platforms')
def test_expandtabs_overflows_gracefully(self):
    # Expanding two tabs to sys.maxint columns each must be reported
    # as an OverflowError.
    with self.assertRaises(OverflowError):
        u't\tt\t'.expandtabs(sys.maxint)
Anthony Baxter67b6d512006-03-30 10:54:07 +00001299
def test__format__(self):
    # Checks unicode.__format__ directly: precision truncation, minimum
    # width, fill characters and alignment.
    #
    # Expected values that contain more than one consecutive padding
    # space are built with an explicit repeat count so the amount of
    # padding is unambiguous.
    def check(value, spec, expected):
        # test both with and without the trailing 's'
        self.assertEqual(value.__format__(spec), expected)
        self.assertEqual(value.__format__(spec + u's'), expected)

    # precision and width
    check(u'', u'', u'')
    check(u'abc', u'', u'abc')
    check(u'abc', u'.3', u'abc')
    check(u'ab', u'.3', u'ab')
    check(u'abcdef', u'.3', u'abc')
    check(u'abcdef', u'.0', u'')
    check(u'abc', u'3.3', u'abc')
    check(u'abc', u'2.3', u'abc')
    check(u'abc', u'2.2', u'ab')
    check(u'abc', u'3.2', u'ab ')

    # explicit fill character, left aligned
    check(u'result', u'x<0', u'result')
    check(u'result', u'x<5', u'result')
    check(u'result', u'x<6', u'result')
    check(u'result', u'x<7', u'resultx')
    check(u'result', u'x<8', u'resultxx')

    # space fill: left, right and centred (the odd space goes right)
    check(u'result', u' <7', u'result ')
    check(u'result', u'<7', u'result ')
    check(u'result', u'>7', u' result')
    check(u'result', u'>8', u' ' * 2 + u'result')
    check(u'result', u'^8', u' result ')
    check(u'result', u'^9', u' result' + u' ' * 2)
    check(u'result', u'^10', u' ' * 2 + u'result' + u' ' * 2)

    # large widths
    check(u'a', u'10000', u'a' + u' ' * 9999)
    check(u'', u'10000', u' ' * 10000)
    check(u'', u'10000000', u' ' * 10000000)

    # test mixing unicode and str
    self.assertEqual(u'abc'.__format__('s'), u'abc')
    self.assertEqual(u'abc'.__format__('->10s'), u'-------abc')
1335
def test_format(self):
    # End-to-end checks of unicode.format(): literal text and brace
    # escaping, field lookup, format specs, !r/!s conversions,
    # user-defined __format__ hooks, and the exceptions raised for
    # malformed templates.
    #
    # Expected values that contain more than one consecutive padding
    # space are built with an explicit repeat count so the amount of
    # padding is unambiguous.
    self.assertEqual(u''.format(), u'')
    self.assertEqual(u'a'.format(), u'a')
    self.assertEqual(u'ab'.format(), u'ab')
    self.assertEqual(u'a{{'.format(), u'a{')
    self.assertEqual(u'a}}'.format(), u'a}')
    self.assertEqual(u'{{b'.format(), u'{b')
    self.assertEqual(u'}}b'.format(), u'}b')
    self.assertEqual(u'a{{b'.format(), u'a{b')

    # examples from the PEP:
    import datetime
    self.assertEqual(u"My name is {0}".format(u'Fred'), u"My name is Fred")
    self.assertEqual(u"My name is {0[name]}".format(dict(name=u'Fred')),
                     u"My name is Fred")
    self.assertEqual(u"My name is {0} :-{{}}".format(u'Fred'),
                     u"My name is Fred :-{}")

    # datetime.__format__ doesn't work with unicode
    #d = datetime.date(2007, 8, 18)
    #self.assertEqual("The year is {0.year}".format(d),
    #                 "The year is 2007")

    # classes we'll use for testing

    # __format__ echoes the format spec back
    class C:
        def __init__(self, x=100):
            self._x = x
        def __format__(self, spec):
            return spec

    # __format__ ignores the spec and returns str(x)
    class D:
        def __init__(self, x):
            self.x = x
        def __format__(self, spec):
            return str(self.x)

    # class with __str__, but no __format__
    class E:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return u'E(' + self.x + u')'

    # class with __repr__, but no __format__ or __str__
    class F:
        def __init__(self, x):
            self.x = x
        def __repr__(self):
            return u'F(' + self.x + u')'

    # class with __format__ that forwards to string, for some format_spec's
    class G:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return u"string is " + self.x
        def __format__(self, format_spec):
            if format_spec == 'd':
                return u'G(' + self.x + u')'
            return object.__format__(self, format_spec)

    # class that returns a bad type from __format__
    class H:
        def __format__(self, format_spec):
            return 1.0

    # date subclass that treats the format spec as a strftime pattern
    class I(datetime.date):
        def __format__(self, format_spec):
            return self.strftime(format_spec)

    # int subclass that formats twice its value
    class J(int):
        def __format__(self, format_spec):
            return int.__format__(self * 2, format_spec)


    self.assertEqual(u''.format(), u'')
    self.assertEqual(u'abc'.format(), u'abc')
    self.assertEqual(u'{0}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:}'.format(u'abc'), u'abc')
    self.assertEqual(u'X{0}'.format(u'abc'), u'Xabc')
    self.assertEqual(u'{0}X'.format(u'abc'), u'abcX')
    self.assertEqual(u'X{0}Y'.format(u'abc'), u'XabcY')
    self.assertEqual(u'{1}'.format(1, u'abc'), u'abc')
    self.assertEqual(u'X{1}'.format(1, u'abc'), u'Xabc')
    self.assertEqual(u'{1}X'.format(1, u'abc'), u'abcX')
    self.assertEqual(u'X{1}Y'.format(1, u'abc'), u'XabcY')
    self.assertEqual(u'{0}'.format(-15), u'-15')
    self.assertEqual(u'{0}{1}'.format(-15, u'abc'), u'-15abc')
    self.assertEqual(u'{0}X{1}'.format(-15, u'abc'), u'-15Xabc')
    self.assertEqual(u'{{'.format(), u'{')
    self.assertEqual(u'}}'.format(), u'}')
    self.assertEqual(u'{{}}'.format(), u'{}')
    self.assertEqual(u'{{x}}'.format(), u'{x}')
    self.assertEqual(u'{{{0}}}'.format(123), u'{123}')
    self.assertEqual(u'{{{{0}}}}'.format(), u'{{0}}')
    self.assertEqual(u'}}{{'.format(), u'}{')
    self.assertEqual(u'}}x{{'.format(), u'}x{')

    # weird field names
    self.assertEqual(u"{0[foo-bar]}".format({u'foo-bar':u'baz'}), u'baz')
    self.assertEqual(u"{0[foo bar]}".format({u'foo bar':u'baz'}), u'baz')
    self.assertEqual(u"{0[ ]}".format({u' ':3}), u'3')

    # attribute and index lookups, including chained ones
    self.assertEqual(u'{foo._x}'.format(foo=C(20)), u'20')
    self.assertEqual(u'{1}{0}'.format(D(10), D(20)), u'2010')
    self.assertEqual(u'{0._x.x}'.format(C(D(u'abc'))), u'abc')
    self.assertEqual(u'{0[0]}'.format([u'abc', u'def']), u'abc')
    self.assertEqual(u'{0[1]}'.format([u'abc', u'def']), u'def')
    self.assertEqual(u'{0[1][0]}'.format([u'abc', [u'def']]), u'def')
    self.assertEqual(u'{0[1][0].x}'.format(['abc', [D(u'def')]]), u'def')

    # strings
    self.assertEqual(u'{0:.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:.3s}'.format(u'ab'), u'ab')
    self.assertEqual(u'{0:.3s}'.format(u'abcdef'), u'abc')
    self.assertEqual(u'{0:.0s}'.format(u'abcdef'), u'')
    self.assertEqual(u'{0:3.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:2.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:2.2s}'.format(u'abc'), u'ab')
    self.assertEqual(u'{0:3.2s}'.format(u'abc'), u'ab ')
    self.assertEqual(u'{0:x<0s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<5s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<6s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<7s}'.format(u'result'), u'resultx')
    self.assertEqual(u'{0:x<8s}'.format(u'result'), u'resultxx')
    self.assertEqual(u'{0: <7s}'.format(u'result'), u'result ')
    self.assertEqual(u'{0:<7s}'.format(u'result'), u'result ')
    self.assertEqual(u'{0:>7s}'.format(u'result'), u' result')
    self.assertEqual(u'{0:>8s}'.format(u'result'), u' ' * 2 + u'result')
    self.assertEqual(u'{0:^8s}'.format(u'result'), u' result ')
    self.assertEqual(u'{0:^9s}'.format(u'result'), u' result' + u' ' * 2)
    self.assertEqual(u'{0:^10s}'.format(u'result'),
                     u' ' * 2 + u'result' + u' ' * 2)
    self.assertEqual(u'{0:10000}'.format(u'a'), u'a' + u' ' * 9999)
    self.assertEqual(u'{0:10000}'.format(u''), u' ' * 10000)
    self.assertEqual(u'{0:10000000}'.format(u''), u' ' * 10000000)

    # issue 12546: use \x00 as a fill character
    self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
    self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
    self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
    self.assertEqual('{0:^6s}'.format('foo'), ' foo' + ' ' * 2)

    self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
    self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
    self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
    self.assertEqual('{0:<6}'.format(3), '3' + ' ' * 5)

    self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
    self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
    self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
    self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')

    self.assertEqual('{0:\x00<12}'.format(3+2.0j),
                     '(3+2j)\x00\x00\x00\x00\x00\x00')
    self.assertEqual('{0:\x01<12}'.format(3+2.0j),
                     '(3+2j)\x01\x01\x01\x01\x01\x01')
    self.assertEqual('{0:\x00^12}'.format(3+2.0j),
                     '\x00\x00\x00(3+2j)\x00\x00\x00')
    self.assertEqual('{0:^12}'.format(3+2.0j),
                     ' ' * 3 + '(3+2j)' + ' ' * 3)

    # format specifiers for user defined type
    self.assertEqual(u'{0:abc}'.format(C()), u'abc')

    # !r and !s coercions
    self.assertEqual(u'{0!s}'.format(u'Hello'), u'Hello')
    self.assertEqual(u'{0!s:}'.format(u'Hello'), u'Hello')
    self.assertEqual(u'{0!s:15}'.format(u'Hello'), u'Hello' + u' ' * 10)
    self.assertEqual(u'{0!s:15s}'.format(u'Hello'), u'Hello' + u' ' * 10)
    self.assertEqual(u'{0!r}'.format(u'Hello'), u"u'Hello'")
    self.assertEqual(u'{0!r:}'.format(u'Hello'), u"u'Hello'")
    self.assertEqual(u'{0!r}'.format(F(u'Hello')), u'F(Hello)')

    # test fallback to object.__format__
    self.assertEqual(u'{0}'.format({}), u'{}')
    self.assertEqual(u'{0}'.format([]), u'[]')
    self.assertEqual(u'{0}'.format([1]), u'[1]')
    self.assertEqual(u'{0}'.format(E(u'data')), u'E(data)')
    self.assertEqual(u'{0:d}'.format(G(u'data')), u'G(data)')
    self.assertEqual(u'{0!s}'.format(G(u'data')), u'string is data')

    # These calls reach object.__format__ with a non-empty spec, so the
    # pending-deprecation warning is expected while they run.
    msg = 'object.__format__ with a non-empty format string is deprecated'
    with test_support.check_warnings((msg, PendingDeprecationWarning)):
        self.assertEqual(u'{0:^10}'.format(E(u'data')),
                         u' E(data)' + u' ' * 2)
        self.assertEqual(u'{0:^10s}'.format(E(u'data')),
                         u' E(data)' + u' ' * 2)
        self.assertEqual(u'{0:>15s}'.format(G(u'data')),
                         u' string is data')

    self.assertEqual(u"{0:date: %Y-%m-%d}".format(I(year=2007,
                                                    month=8,
                                                    day=27)),
                     u"date: 2007-08-27")

    # test deriving from a builtin type and overriding __format__
    self.assertEqual(u"{0}".format(J(10)), u"20")


    # string format specifiers
    self.assertEqual(u'{0:}'.format('a'), u'a')

    # computed format specifiers
    self.assertEqual(u"{0:.{1}}".format(u'hello world', 5), u'hello')
    self.assertEqual(u"{0:.{1}s}".format(u'hello world', 5), u'hello')
    self.assertEqual(
        u"{0:.{precision}s}".format('hello world', precision=5),
        u'hello')
    self.assertEqual(
        u"{0:{width}.{precision}s}".format('hello world',
                                           width=10, precision=5),
        u'hello' + u' ' * 5)
    self.assertEqual(
        u"{0:{width}.{precision}s}".format('hello world',
                                           width='10', precision='5'),
        u'hello' + u' ' * 5)

    # test various errors
    self.assertRaises(ValueError, u'{'.format)
    self.assertRaises(ValueError, u'}'.format)
    self.assertRaises(ValueError, u'a{'.format)
    self.assertRaises(ValueError, u'a}'.format)
    self.assertRaises(ValueError, u'{a'.format)
    self.assertRaises(ValueError, u'}a'.format)
    self.assertRaises(IndexError, u'{0}'.format)
    self.assertRaises(IndexError, u'{1}'.format, u'abc')
    self.assertRaises(KeyError, u'{x}'.format)
    self.assertRaises(ValueError, u"}{".format)
    self.assertRaises(ValueError, u"{".format)
    self.assertRaises(ValueError, u"}".format)
    self.assertRaises(ValueError, u"abc{0:{}".format)
    self.assertRaises(ValueError, u"{0".format)
    self.assertRaises(IndexError, u"{0.}".format)
    self.assertRaises(ValueError, u"{0.}".format, 0)
    self.assertRaises(IndexError, u"{0[}".format)
    self.assertRaises(ValueError, u"{0[}".format, [])
    self.assertRaises(KeyError, u"{0]}".format)
    self.assertRaises(ValueError, u"{0.[]}".format, 0)
    self.assertRaises(ValueError, u"{0..foo}".format, 0)
    self.assertRaises(ValueError, u"{0[0}".format, 0)
    self.assertRaises(ValueError, u"{0[0:foo}".format, 0)
    self.assertRaises(KeyError, u"{c]}".format)
    self.assertRaises(ValueError, u"{{ {{{0}}".format, 0)
    self.assertRaises(ValueError, u"{0}}".format, 0)
    self.assertRaises(KeyError, u"{foo}".format, bar=3)
    self.assertRaises(ValueError, u"{0!x}".format, 3)
    self.assertRaises(ValueError, u"{0!}".format, 0)
    self.assertRaises(ValueError, u"{0!rs}".format, 0)
    self.assertRaises(ValueError, u"{!}".format)
    self.assertRaises(IndexError, u"{:}".format)
    self.assertRaises(IndexError, u"{:s}".format)
    self.assertRaises(IndexError, u"{}".format)
    # a field index far too large to be a valid position
    big = u"23098475029384702983476098230754973209482573"
    self.assertRaises(ValueError, (u"{" + big + u"}").format)
    self.assertRaises(ValueError, (u"{[" + big + u"]}").format, [0])

    # issue 6089
    self.assertRaises(ValueError, u"{0[0]x}".format, [None])
    self.assertRaises(ValueError, u"{0[0](10)}".format, [None])

    # can't have a replacement on the field name portion
    self.assertRaises(TypeError, u'{0[{1}]}'.format, u'abcdefg', 4)

    # exceed maximum recursion depth
    self.assertRaises(ValueError, u"{0:{1:{2}}}".format, u'abc', u's', u'')
    self.assertRaises(ValueError, u"{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
                      0, 1, 2, 3, 4, 5, 6, 7)

    # string format spec errors
    self.assertRaises(ValueError, u"{0:-s}".format, u'')
    self.assertRaises(ValueError, format, u"", u"-")
    self.assertRaises(ValueError, u"{0:=s}".format, u'')

    # test combining string and unicode
    self.assertEqual(u"foo{0}".format('bar'), u'foobar')
    # This will try to convert the argument from unicode to str, which
    # will succeed
    self.assertEqual("foo{0}".format(u'bar'), 'foobar')
    # This will try to convert the argument from unicode to str, which
    # will fail
    self.assertRaises(UnicodeEncodeError, "foo{0}".format, u'\u1000bar')
1602
def test_format_huge_precision(self):
    # A precision of sys.maxsize + 1 in a float format spec must be
    # rejected with ValueError.
    spec = u".{}f".format(sys.maxsize + 1)
    self.assertRaises(ValueError, format, 2.34, spec)
1607
def test_format_huge_width(self):
    # A width of sys.maxsize + 1 in a float format spec must be
    # rejected with ValueError.
    spec = u"{}f".format(sys.maxsize + 1)
    self.assertRaises(ValueError, format, 2.34, spec)
1612
def test_format_huge_item_number(self):
    # A positional field index of sys.maxsize + 1 in a template must
    # be rejected with ValueError when the template is applied.
    template = u"{{{}:.6f}}".format(sys.maxsize + 1)
    self.assertRaises(ValueError, template.format, 2.34)
1617
def test_format_auto_numbering(self):
    # Checks automatically numbered fields ('{}'), alone, nested inside
    # format specs, and combined with named fields.
    #
    # Expected values that contain more than one consecutive padding
    # space are built with an explicit repeat count so the amount of
    # padding is unambiguous.

    # __format__ echoes the format spec back
    class C:
        def __init__(self, x=100):
            self._x = x
        def __format__(self, spec):
            return spec

    self.assertEqual(u'{}'.format(10), u'10')
    self.assertEqual(u'{:5}'.format('s'), u's' + u' ' * 4)
    self.assertEqual(u'{!r}'.format('s'), u"'s'")
    self.assertEqual(u'{._x}'.format(C(10)), u'10')
    self.assertEqual(u'{[1]}'.format([1, 2]), u'2')
    self.assertEqual(u'{[a]}'.format({'a':4, 'b':2}), u'4')
    self.assertEqual(u'a{}b{}c'.format(0, 1), u'a0b1c')

    # auto-numbered fields nested inside the format spec; centring in
    # width 10 puts 4 spaces before and 5 after the single character
    self.assertEqual(u'a{:{}}b'.format('x', '^10'),
                     u'a' + u' ' * 4 + u'x' + u' ' * 5 + u'b')
    self.assertEqual(u'a{:{}x}b'.format(20, '#'), u'a0x14b')

    # can't mix and match numbering and auto-numbering
    self.assertRaises(ValueError, u'{}{1}'.format, 1, 2)
    self.assertRaises(ValueError, u'{1}{}'.format, 1, 2)
    self.assertRaises(ValueError, u'{:{1}}'.format, 1, 2)
    self.assertRaises(ValueError, u'{0:{}}'.format, 1, 2)

    # can mix and match auto-numbering and named
    self.assertEqual(u'{f}{}'.format(4, f='test'), u'test4')
    self.assertEqual(u'{}{f}'.format(4, f='test'), u'4test')
    self.assertEqual(u'{:{f}}{g}{}'.format(1, 3, g='g', f=2), u' 1g3')
    self.assertEqual(u'{f:{}}{}{g}'.format(2, 4, f=1, g='g'), u' 14g')
1647
def test_raiseMemError(self):
    # Ensure that the freelist contains a consistent object, even
    # when a string allocation fails with a MemoryError.
    # This used to crash the interpreter,
    # or leak references when the number was smaller.
    if sys.maxunicode >= 0x10000:
        charwidth = 4
    else:
        charwidth = 2

    def alloc():
        # Note: sys.maxsize is half of the actual max allocation because
        # of the signedness of Py_ssize_t.
        return u"a" * (sys.maxsize // charwidth * 2)

    # The oversized allocation is attempted twice in a row.
    for _ in range(2):
        self.assertRaises(MemoryError, alloc)
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +00001659
def test_format_subclass(self):
    # Both %-formatting and str.format() with a byte-string template
    # must use the __unicode__ override of a unicode subclass instead
    # of the stored value.
    class Overriding(unicode):
        def __unicode__(self):
            return u'__unicode__ overridden'

    instance = Overriding(u'xxx')
    percent_result = "%s" % instance
    self.assertEqual(percent_result, u'__unicode__ overridden')
    method_result = "{}".format(instance)
    self.assertEqual(method_result, '__unicode__ overridden')
Victor Stinner95affc42010-03-22 12:24:37 +00001667
# Test PyUnicode_FromFormat()
def test_from_format(self):
    # Calls the C-level PyUnicode_FromFormat through ctypes and compares
    # its output with expected unicode strings.
    # NOTE(review): import_module presumably skips the test when ctypes
    # is unavailable -- confirm against test_support.
    test_support.import_module('ctypes')
    from ctypes import (
        pythonapi, py_object, sizeof,
        c_int, c_long, c_longlong, c_ssize_t,
        c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
    # The exported symbol name depends on the unicode build width.
    if sys.maxunicode == 0xffff:
        name = "PyUnicodeUCS2_FromFormat"
    else:
        name = "PyUnicodeUCS4_FromFormat"
    _PyUnicode_FromFormat = getattr(pythonapi, name)
    _PyUnicode_FromFormat.restype = py_object

    def PyUnicode_FromFormat(format, *args):
        # unicode arguments are wrapped in py_object; every other
        # argument is passed through unchanged.
        cargs = tuple(
            py_object(arg) if isinstance(arg, unicode) else arg
            for arg in args)
        return _PyUnicode_FromFormat(format, *cargs)

    def check_format(expected, format, *args):
        # Format and compare against the expected unicode string.
        text = PyUnicode_FromFormat(format, *args)
        self.assertEqual(expected, text)

    # ascii format, non-ascii argument
    check_format(u'ascii\x7f=unicode\xe9',
                 b'ascii\x7f=%U', u'unicode\xe9')

    # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
    # raises an error
    #self.assertRaisesRegex(ValueError,
    #    '^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
    #    'string, got a non-ASCII byte: 0xe9$',
    #    PyUnicode_FromFormat, b'unicode\xe9=%s', u'ascii')

    # test "%c"
    check_format(u'\uabcd',
                 b'%c', c_int(0xabcd))
    # Code points above 0xffff are accepted only when sys.maxunicode
    # allows them; otherwise they must overflow.
    if sys.maxunicode > 0xffff:
        check_format(u'\U0010ffff',
                     b'%c', c_int(0x10ffff))
    else:
        with self.assertRaises(OverflowError):
            PyUnicode_FromFormat(b'%c', c_int(0x10000))
    # 0x110000 must overflow regardless of build width.
    with self.assertRaises(OverflowError):
        PyUnicode_FromFormat(b'%c', c_int(0x110000))
    # Issue #18183
    if sys.maxunicode > 0xffff:
        check_format(u'\U00010000\U00100000',
                     b'%c%c', c_int(0x10000), c_int(0x100000))

    # test "%"
    check_format(u'%',
                 b'%')
    check_format(u'%',
                 b'%%')
    check_format(u'%s',
                 b'%%s')
    check_format(u'[%]',
                 b'[%%]')
    check_format(u'%abc',
                 b'%%%s', b'abc')

    # test %S
    check_format(u"repr=abc",
                 b'repr=%S', u'abc')

    # test %R
    check_format(u"repr=u'abc'",
                 b'repr=%R', u'abc')

    # test integer formats (%i, %d, %u)
    check_format(u'010',
                 b'%03i', c_int(10))
    check_format(u'0010',
                 b'%0.4i', c_int(10))
    check_format(u'-123',
                 b'%i', c_int(-123))

    check_format(u'-123',
                 b'%d', c_int(-123))
    check_format(u'-123',
                 b'%ld', c_long(-123))
    check_format(u'-123',
                 b'%zd', c_ssize_t(-123))

    check_format(u'123',
                 b'%u', c_uint(123))
    check_format(u'123',
                 b'%lu', c_ulong(123))
    check_format(u'123',
                 b'%zu', c_size_t(123))

    # test long output
    # Limits are derived from the platform's sizeof(c_long)/c_ulong.
    min_long = -(2 ** (8 * sizeof(c_long) - 1))
    max_long = -min_long - 1
    check_format(unicode(min_long),
                 b'%ld', c_long(min_long))
    check_format(unicode(max_long),
                 b'%ld', c_long(max_long))
    max_ulong = 2 ** (8 * sizeof(c_ulong)) - 1
    check_format(unicode(max_ulong),
                 b'%lu', c_ulong(max_ulong))
    # The %p result is discarded; the call only has to complete.
    PyUnicode_FromFormat(b'%p', c_void_p(-1))

    # test padding (width and/or precision)
    # Expected values are built with rjust: a zero flag or a precision
    # pads with '0', a plain width pads with spaces.
    check_format(u'123'.rjust(10, u'0'),
                 b'%010i', c_int(123))
    check_format(u'123'.rjust(100),
                 b'%100i', c_int(123))
    check_format(u'123'.rjust(100, u'0'),
                 b'%.100i', c_int(123))
    check_format(u'123'.rjust(80, u'0').rjust(100),
                 b'%100.80i', c_int(123))

    check_format(u'123'.rjust(10, u'0'),
                 b'%010u', c_uint(123))
    check_format(u'123'.rjust(100),
                 b'%100u', c_uint(123))
    check_format(u'123'.rjust(100, u'0'),
                 b'%.100u', c_uint(123))
    check_format(u'123'.rjust(80, u'0').rjust(100),
                 b'%100.80u', c_uint(123))

    check_format(u'123'.rjust(10, u'0'),
                 b'%010x', c_int(0x123))
    check_format(u'123'.rjust(100),
                 b'%100x', c_int(0x123))
    check_format(u'123'.rjust(100, u'0'),
                 b'%.100x', c_int(0x123))
    check_format(u'123'.rjust(80, u'0').rjust(100),
                 b'%100.80x', c_int(0x123))

    # test %V
    # %V takes two arguments; the byte string is used when the first
    # argument is None.
    check_format(u'repr=abc',
                 b'repr=%V', u'abc', b'xyz')
    check_format(u'repr=\xe4\xba\xba\xe6\xb0\x91',
                 b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
    check_format(u'repr=abc\xff',
                 b'repr=%V', None, b'abc\xff')

    # not supported: copy the raw format string. these tests are just here
    # to check for crashes and should not be considered as specifications
    check_format(u'%s',
                 b'%1%s', b'abc')
    check_format(u'%1abc',
                 b'%1abc')
    check_format(u'%+i',
                 b'%+i', c_int(10))
    check_format(u'%s',
                 b'%.%s', b'abc')
1819
@test_support.cpython_only
def test_encode_decimal(self):
    # Checks _testcapi.unicode_encodedecimal with and without an
    # explicit error handler.
    from _testcapi import unicode_encodedecimal

    # Inputs encoded without naming an error handler: non-ASCII
    # decimal digits become ASCII digits, Unicode spaces become ' '.
    plain_cases = (
        (u'123', b'123'),
        (u'\u0663.\u0661\u0664', b'3.14'),
        (u"\N{EM SPACE}3.14\N{EN SPACE}", b' 3.14 '),
    )
    for text, encoded in plain_cases:
        self.assertEqual(unicode_encodedecimal(text), encoded)

    # A character that is neither digit nor space fails under "strict".
    self.assertRaises(UnicodeEncodeError,
                      unicode_encodedecimal, u"123\u20ac", "strict")

    # The same kind of input under each of the other error handlers.
    handler_cases = (
        (u"123\u20ac", "replace", b'123?'),
        (u"123\u20ac", "ignore", b'123'),
        (u"123\u20ac", "xmlcharrefreplace", b'123&#8364;'),
        (u"123\u20ac", "backslashreplace", b'123\\u20ac'),
        (u"123\u20ac\N{EM SPACE}", "replace", b'123? '),
        (u"123\u20ac\u20ac", "replace", b'123??'),
        (u"123\u20ac\u0660", "replace", b'123?0'),
    )
    for text, handler, encoded in handler_cases:
        self.assertEqual(unicode_encodedecimal(text, handler), encoded)
1845
@test_support.cpython_only
def test_encode_decimal_with_surrogates(self):
    # Checks the character references that "xmlcharrefreplace" emits
    # for a non-BMP character and for lone surrogates.
    from _testcapi import unicode_encodedecimal
    cases = [
        (u'\U0001f49d', '&#128157;'),
        (u'\ud83d', '&#55357;'),
        (u'\udc9d', '&#56477;'),
    ]
    # Only when the surrogate pair is a different string from the
    # single non-BMP character is the pair checked as a separate case.
    if u'\ud83d\udc9d' != u'\U0001f49d':
        cases.append((u'\ud83d\udc9d', '&#55357;&#56477;'))
    for suffix, charrefs in cases:
        encoded = unicode_encodedecimal(u"123" + suffix,
                                        "xmlcharrefreplace")
        self.assertEqual(encoded, '123' + charrefs)
Victor Stinner95affc42010-03-22 12:24:37 +00001859
def test_main():
    # Hand this module's name to test_support.run_unittest.
    test_support.run_unittest(__name__)
Barry Warsaw817918c2002-08-06 16:58:21 +00001862
# Run the tests when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()