blob: be8f89be07ce4d70825af899b95932efabcb39be [file] [log] [blame]
Guido van Rossuma831cac2000-03-10 23:23:21 +00001""" Test script for the Unicode implementation.
2
Guido van Rossuma831cac2000-03-10 23:23:21 +00003Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
Marc-André Lemburg36619082001-01-17 19:11:13 +00007"""#"
Ezio Melotti12682b12011-08-22 23:46:30 +03008import sys
9import struct
10import codecs
11import unittest
Walter Dörwald0fd583c2003-02-21 12:53:50 +000012from test import test_support, string_tests
Guido van Rossuma831cac2000-03-10 23:23:21 +000013
# Decorator to skip tests on narrow builds: the decorated test is skipped
# whenever sys.maxunicode equals 65535 (0xFFFF).
requires_wide_build = unittest.skipIf(sys.maxunicode == 65535,
                                      'requires wide build')
17
Neal Norwitz430f68b2005-11-24 22:00:56 +000018# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook that serves two deliberately broken codecs.

    "test.unicode1" yields an encoder/decoder pair that returns a bare
    int instead of a tuple; "test.unicode2" yields a pair that returns a
    tuple whose first item is not a string.  Every other name yields
    None.
    """
    def broken_codec(result):
        # Build a codec-style callable that ignores its arguments and
        # always returns the given malformed result.
        def codec(input, errors="strict"):
            return result
        return codec

    if encoding == "test.unicode1":
        # not a tuple
        return (broken_codec(42), broken_codec(42), None, None)
    if encoding == "test.unicode2":
        # no unicode
        return (broken_codec((42, 42)), broken_codec((42, 42)), None, None)
    return None
codecs.register(search_function)
35
Walter Dörwald0fd583c2003-02-21 12:53:50 +000036class UnicodeTest(
37 string_tests.CommonTest,
Walter Dörwald57d88e52004-08-26 16:53:04 +000038 string_tests.MixinStrUnicodeUserStringTest,
39 string_tests.MixinStrUnicodeTest,
Walter Dörwald0fd583c2003-02-21 12:53:50 +000040 ):
41 type2test = unicode
42
def assertEqual(self, first, second, msg=None):
    # Strict assertEqual: run the regular equality check, then reject
    # pairs that mix string types.  If either operand is unicode both
    # must be unicode; otherwise, if either is str both must be str.
    super(UnicodeTest, self).assertEqual(first, second, msg)
    for string_type in (unicode, str):
        if isinstance(first, string_type) or isinstance(second, string_type):
            self.assertIsInstance(first, string_type)
            self.assertIsInstance(second, string_type)
            break
52
def checkequalnofix(self, result, object, methodname, *args):
    # Call object.methodname(*args) and check that the outcome equals
    # `result` and has exactly the same type.  The object and the
    # arguments are used exactly as passed in.
    method = getattr(object, methodname)
    realresult = method(*args)
    self.assertEqual(realresult, result)
    # assertIs names both operands on failure, unlike assertTrue(a is b)
    self.assertIs(type(realresult), type(result))

    # if the original is returned make sure that
    # this doesn't happen with subclasses
    if realresult is object:
        class usub(unicode):
            def __repr__(self):
                return 'usub(%r)' % unicode.__repr__(self)
        object = usub(object)
        method = getattr(object, methodname)
        realresult = method(*args)
        self.assertEqual(realresult, result)
        self.assertIsNot(object, realresult)
Guido van Rossume4874ae2001-09-21 15:36:41 +000070
def test_literals(self):
    # Different escape spellings of the same character compare equal.
    self.assertEqual(u'\xff', u'\u00ff')
    self.assertEqual(u'\uffff', u'\U0000ffff')
    # Evaluating each of these \U literals must raise SyntaxError.
    rejected_sources = (
        "u'\\Ufffffffe'",
        "u'\\Uffffffff'",
        "u'\\U%08x'" % 0x110000,
    )
    for literal_source in rejected_sources:
        self.assertRaises(SyntaxError, eval, literal_source)
Jeremy Hylton504de6b2003-10-06 05:08:26 +000077
def test_repr(self):
    # Check repr() of unicode objects: backslash and control-character
    # escaping, quote selection, the code points 0-255, and a long
    # string made of wide escapes.  Everything is skipped when
    # sys.platform starts with 'java'.
    if not sys.platform.startswith('java'):
        # Test basic sanity of repr()
        self.assertEqual(repr(u'abc'), "u'abc'")
        self.assertEqual(repr(u'ab\\c'), "u'ab\\\\c'")
        self.assertEqual(repr(u'ab\\'), "u'ab\\\\'")
        self.assertEqual(repr(u'\\c'), "u'\\\\c'")
        self.assertEqual(repr(u'\\'), "u'\\\\'")
        self.assertEqual(repr(u'\n'), "u'\\n'")
        self.assertEqual(repr(u'\r'), "u'\\r'")
        self.assertEqual(repr(u'\t'), "u'\\t'")
        self.assertEqual(repr(u'\b'), "u'\\x08'")
        # Quote handling: a string holding both quote kinds escapes the
        # single quote; one holding only a single quote is delimited
        # with double quotes.
        self.assertEqual(repr(u"'\""), """u'\\'"'""")
        self.assertEqual(repr(u"'\""), """u'\\'"'""")
        self.assertEqual(repr(u"'"), '''u"'"''')
        self.assertEqual(repr(u'"'), """u'"'""")
        # Expected repr of the string made of code points 0 through 255.
        latin1repr = (
            "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
            "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
            "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
            "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
            "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
            "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
            "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
            "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
            "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
            "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
            "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
            "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
            "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
            "\\xfe\\xff'")
        testrepr = repr(u''.join(map(unichr, xrange(256))))
        self.assertEqual(testrepr, latin1repr)
        # Test repr works on wide unicode escapes without overflow.
        # Both sides are the same expression, so this only verifies that
        # repr() completes and is repeatable for this input.
        self.assertEqual(repr(u"\U00010000" * 39 + u"\uffff" * 4096),
                         repr(u"\U00010000" * 39 + u"\uffff" * 4096))
114
Walter Dörwald28256f22003-01-19 16:59:20 +0000115
def test_count(self):
    string_tests.CommonTest.test_count(self)
    # check mixed argument types
    # each row: (expected count, subject, arguments passed to count())
    mixed_cases = (
        (3, 'aaa', (u'a',)),
        (0, 'aaa', (u'b',)),
        (3, u'aaa', ('a',)),
        (0, u'aaa', ('b',)),
        (0, u'aaa', ('b',)),
        (1, u'aaa', ('a', -1)),
        (3, u'aaa', ('a', -10)),
        (2, u'aaa', ('a', 0, -1)),
        (0, u'aaa', ('a', 0, -10)),
    )
    for expected, subject, count_args in mixed_cases:
        self.checkequalnofix(expected, subject, 'count', *count_args)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000128
def test_find(self):
    # each row: (expected index, arguments passed to find())
    find_cases = (
        (0, (u'abc',)),
        (9, (u'abc', 1)),
        (-1, (u'def', 4)),
    )
    for expected, find_args in find_cases:
        self.checkequalnofix(expected, u'abcdefghiabc', 'find', *find_args)

    # calling find() without an argument or with an int raises TypeError
    for bad_args in ((), (42,)):
        self.assertRaises(TypeError, u'hello'.find, *bad_args)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000136
def test_rfind(self):
    string_tests.CommonTest.test_rfind(self)
    # check mixed argument types
    mixed_cases = (
        (9, 'abcdefghiabc', u'abc'),
        (12, 'abcdefghiabc', u''),
        (12, u'abcdefghiabc', ''),
    )
    for expected, subject, needle in mixed_cases:
        self.checkequalnofix(expected, subject, 'rfind', needle)
Guido van Rossum8b264542000-12-19 02:22:31 +0000143
def test_index(self):
    string_tests.CommonTest.test_index(self)
    # check mixed argument types
    # rows: (expected index, needle, extra positional arguments)
    found_cases = (
        (0, '', ()),
        (3, 'def', ()),
        (0, 'abc', ()),
        (9, 'abc', (1,)),
    )
    # rows: (subject, needle, extra positional arguments) -> ValueError
    missing_cases = (
        ('abcdefghiabc', 'hib', ()),
        ('abcdefghiab', 'abc', (1,)),
        ('abcdefghi', 'ghi', (8,)),
        ('abcdefghi', 'ghi', (-1,)),
    )
    for t1, t2 in ((str, unicode), (unicode, str)):
        for expected, needle, extra in found_cases:
            self.checkequalnofix(expected, t1('abcdefghiabc'), 'index',
                                 t2(needle), *extra)
        for subject, needle, extra in missing_cases:
            self.assertRaises(ValueError, t1(subject).index,
                              t2(needle), *extra)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000156
def test_rindex(self):
    string_tests.CommonTest.test_rindex(self)
    # check mixed argument types
    # rows: (expected index, needle, extra positional arguments)
    found_cases = (
        (12, '', ()),
        (3, 'def', ()),
        (9, 'abc', ()),
        (0, 'abc', (0, -1)),
    )
    # rows: (subject, needle, extra positional arguments) -> ValueError
    missing_cases = (
        ('abcdefghiabc', 'hib', ()),
        ('defghiabc', 'def', (1,)),
        ('defghiabc', 'abc', (0, -1)),
        ('abcdefghi', 'ghi', (0, 8)),
        ('abcdefghi', 'ghi', (0, -1)),
    )
    for t1, t2 in ((str, unicode), (unicode, str)):
        for expected, needle, extra in found_cases:
            self.checkequalnofix(expected, t1('abcdefghiabc'), 'rindex',
                                 t2(needle), *extra)

        for subject, needle, extra in missing_cases:
            self.assertRaises(ValueError, t1(subject).rindex,
                              t2(needle), *extra)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000171
def test_translate(self):
    # rows: (expected result, subject, translation table)
    # Table values used here: None, an ordinal, or a unicode string
    # (including the empty string and a multi-character string).
    translate_cases = (
        (u'bbbc', u'abababc', {ord('a'):None}),
        (u'iiic', u'abababc', {ord('a'):None, ord('b'):ord('i')}),
        (u'iiix', u'abababc',
         {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'}),
        (u'<i><i><i>c', u'abababc', {ord('a'):None, ord('b'):u'<i>'}),
        (u'c', u'abababc', {ord('a'):None, ord('b'):u''}),
        (u'xyyx', u'xzx', {ord('z'):u'yy'}),
    )
    for expected, subject, table in translate_cases:
        self.checkequalnofix(expected, subject, 'translate', table)

    # a missing table, or a table mapping to a byte string, is a TypeError
    self.assertRaises(TypeError, u'hello'.translate)
    bad_table = {ord('a'):''}
    self.assertRaises(TypeError, u'abababc'.translate, bad_table)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000182
def test_split(self):
    string_tests.CommonTest.test_split(self)

    # Mixed arguments
    # rows: (expected pieces, subject, separator)
    mixed_cases = (
        ([u'a', u'b', u'c', u'd'], u'a//b//c//d', '//'),
        ([u'a', u'b', u'c', u'd'], 'a//b//c//d', u'//'),
        ([u'endcase ', u''], u'endcase test', 'test'),
    )
    for expected, subject, separator in mixed_cases:
        self.checkequalnofix(expected, subject, 'split', separator)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000190
def test_join(self):
    string_tests.MixinStrUnicodeUserStringTest.test_join(self)

    def check_join(expected, separator, items):
        # separator.join(items) must give `expected` with the same type
        self.checkequalnofix(expected, separator, 'join', items)

    # mixed arguments
    check_join(u'a b c d', u' ', ['a', 'b', u'c', u'd'])
    check_join(u'abcd', u'', (u'a', u'b', u'c', u'd'))
    check_join(u'w x y z', u' ', string_tests.Sequence('wxyz'))
    check_join(u'a b c d', ' ', [u'a', u'b', u'c', u'd'])
    check_join(u'a b c d', ' ', ['a', 'b', u'c', u'd'])
    check_join(u'abcd', '', (u'a', u'b', u'c', u'd'))
    check_join(u'w x y z', ' ', string_tests.Sequence(u'wxyz'))
Marc-André Lemburge5034372000-08-08 08:04:29 +0000202
def test_strip(self):
    string_tests.CommonTest.test_strip(self)
    # stripping with the byte string "\xff" must raise UnicodeError
    strip = u"hello".strip
    self.assertRaises(UnicodeError, strip, "\xff")
Guido van Rossuma831cac2000-03-10 23:23:21 +0000206
def test_replace(self):
    string_tests.CommonTest.test_replace(self)

    # method call forwarded from str implementation because of unicode argument
    expected = u'one@two!three!'
    self.checkequalnofix(expected, 'one!two!three!', 'replace',
                         u'!', u'@', 1)
    # an int replacement value is a TypeError
    replace = 'replace'.replace
    self.assertRaises(TypeError, replace, u"r", 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000213
def test_comparison(self):
    # Comparisons:
    # ==, > and < are each checked for unicode/str, str/unicode and
    # unicode/unicode operand pairs.
    for left, right in ((u'abc', 'abc'), ('abc', u'abc'), (u'abc', u'abc')):
        self.assertTrue(left == right)
    for longer, shorter in ((u'abcd', 'abc'), ('abcd', u'abc'),
                            (u'abcd', u'abc')):
        self.assertTrue(longer > shorter)
    for shorter, longer in ((u'abc', 'abcd'), ('abc', u'abcd'),
                            (u'abc', u'abcd')):
        self.assertTrue(shorter < longer)

    if 0:
        # Move these tests to a Unicode collation module test...
        # Testing UTF-16 code point order comparisons...

        # No surrogates, no fixup required.
        self.assertTrue(u'\u0061' < u'\u20ac')
        # Non surrogate below surrogate value, no fixup required
        self.assertTrue(u'\u0061' < u'\ud800\udc02')

        # Non surrogate above surrogate value, fixup required
        surrogate_pairs = (
            u'\ud800\udc01', u'\ud900\udc01', u'\uda00\udc01', u'\udb00\udc01',
            u'\ud800\udd01', u'\ud900\udd01', u'\uda00\udd01', u'\udb00\udd01',
            u'\ud800\ude01', u'\ud900\ude01', u'\uda00\ude01', u'\udb00\ude01',
            u'\ud800\udfff', u'\ud900\udfff', u'\uda00\udfff', u'\udb00\udfff',
        )
        for non_surrogate in (u'\ue000', u'\uff61'):
            for pair in surrogate_pairs:
                self.assertTrue(non_surrogate < pair)

    # Surrogates on both sides, no fixup required
    self.assertTrue(u'\ud800\udc02' < u'\ud84d\udc56')
Walter Dörwald28256f22003-01-19 16:59:20 +0000278
def test_capitalize(self):
    string_tests.CommonTest.test_capitalize(self)
    # rows: (expected capitalize() result, input)
    capitalize_cases = (
        # check that titlecased chars are lowered correctly
        # \u1ffc is the titlecased char
        (u'\u1ffc\u1ff3\u1ff3\u1ff3', u'\u1ff3\u1ff3\u1ffc\u1ffc'),
        # check with cased non-letter chars
        (u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
         u'\u24c5\u24ce\u24c9\u24bd\u24c4\u24c3'),
        (u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
         u'\u24df\u24e8\u24e3\u24d7\u24de\u24dd'),
        (u'\u2160\u2171\u2172', u'\u2160\u2161\u2162'),
        (u'\u2160\u2171\u2172', u'\u2170\u2171\u2172'),
        # check with Ll chars with no upper - nothing changes here
        (u'\u019b\u1d00\u1d86\u0221\u1fb7',
         u'\u019b\u1d00\u1d86\u0221\u1fb7'),
    )
    for expected, original in capitalize_cases:
        self.checkequal(expected, original, 'capitalize')
297
def test_islower(self):
    string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
    # U+1FFC must not be reported as lowercase
    subject = u'\u1FFc'
    self.checkequalnofix(False, subject, 'islower')
Walter Dörwald28256f22003-01-19 16:59:20 +0000301
@requires_wide_build
def test_islower_non_bmp(self):
    # non-BMP, uppercase
    for upper in (u'\U00010401', u'\U00010427'):
        self.assertFalse(upper.islower())
    # non-BMP, lowercase
    for lower in (u'\U00010429', u'\U0001044E'):
        self.assertTrue(lower.islower())
    # non-BMP, non-cased
    for uncased in (u'\U0001F40D', u'\U0001F46F'):
        self.assertFalse(uncased.islower())
313
def test_isupper(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
    # the U+1FFC check is not run when sys.platform starts with 'java'
    if sys.platform.startswith('java'):
        return
    self.checkequalnofix(False, u'\u1FFc', 'isupper')
Walter Dörwald28256f22003-01-19 16:59:20 +0000318
@requires_wide_build
def test_isupper_non_bmp(self):
    # non-BMP, uppercase
    for upper in (u'\U00010401', u'\U00010427'):
        self.assertTrue(upper.isupper())
    # non-BMP, lowercase
    for lower in (u'\U00010429', u'\U0001044E'):
        self.assertFalse(lower.isupper())
    # non-BMP, non-cased
    for uncased in (u'\U0001F40D', u'\U0001F46F'):
        self.assertFalse(uncased.isupper())
330
def test_istitle(self):
    string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
    # U+1FFC alone, and as the first character of a word, counts as
    # titlecase
    for titled in (u'\u1FFc', u'Greek \u1FFcitlecases ...'):
        self.checkequalnofix(True, titled, 'istitle')
Walter Dörwald28256f22003-01-19 16:59:20 +0000335
@requires_wide_build
def test_istitle_non_bmp(self):
    # non-BMP, uppercase + lowercase
    for titled in (u'\U00010401\U00010429', u'\U00010427\U0001044E'):
        self.assertTrue(titled.istitle())
    # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
    not_titled = (u'\U00010429', u'\U0001044E', u'\U0001F40D', u'\U0001F46F')
    for ch in not_titled:
        verdict = ch.istitle()
        self.assertFalse(verdict, '{!r} is not title'.format(ch))
344
def test_isspace(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
    # rows: (expected isspace() result, character)
    space_cases = (
        (True, u'\u2000'),
        (True, u'\u200a'),
        (False, u'\u2014'),
    )
    for expected, ch in space_cases:
        self.checkequalnofix(expected, ch, 'isspace')
Walter Dörwald28256f22003-01-19 16:59:20 +0000350
@requires_wide_build
def test_isspace_non_bmp(self):
    # apparently there are no non-BMP spaces chars in Unicode 6
    non_spaces = (u'\U00010401', u'\U00010427', u'\U00010429',
                  u'\U0001044E', u'\U0001F40D', u'\U0001F46F')
    for ch in non_spaces:
        verdict = ch.isspace()
        self.assertFalse(verdict, '{!r} is not space.'.format(ch))
357
@requires_wide_build
def test_isalnum_non_bmp(self):
    # every one of these non-BMP characters must be alphanumeric
    alnum_chars = (u'\U00010401', u'\U00010427', u'\U00010429',
                   u'\U0001044E', u'\U0001D7F6', u'\U000104A0',
                   u'\U000104A0', u'\U0001F107')
    for ch in alnum_chars:
        verdict = ch.isalnum()
        self.assertTrue(verdict, '{!r} is alnum.'.format(ch))
363
def test_isalpha(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
    # U+1FFC must be reported as alphabetic
    subject = u'\u1FFc'
    self.checkequalnofix(True, subject, 'isalpha')
Walter Dörwald28256f22003-01-19 16:59:20 +0000367
@requires_wide_build
def test_isalpha_non_bmp(self):
    # non-BMP, cased
    cased = (u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E')
    for ch in cased:
        self.assertTrue(ch.isalpha())
    # non-BMP, non-cased
    for ch in (u'\U0001F40D', u'\U0001F46F'):
        self.assertFalse(ch.isalpha())
378
def test_isdecimal(self):
    # rows: (expected isdecimal() result, subject)
    decimal_cases = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (False, u'\u2460'),  # CIRCLED DIGIT ONE
        (False, u'\xbc'),  # VULGAR FRACTION ONE QUARTER
        (True, u'\u0660'),  # ARABIC-INDIC DIGIT ZERO
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, subject in decimal_cases:
        self.checkequalnofix(expected, subject, 'isdecimal')

    # isdecimal() takes no arguments
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)
Walter Dörwald28256f22003-01-19 16:59:20 +0000390
@requires_wide_build
def test_isdecimal_non_bmp(self):
    # non-BMP characters that must not count as decimal
    non_decimals = (u'\U00010401', u'\U00010427', u'\U00010429',
                    u'\U0001044E', u'\U0001F40D', u'\U0001F46F',
                    u'\U00011065', u'\U0001F107')
    for ch in non_decimals:
        verdict = ch.isdecimal()
        self.assertFalse(verdict, '{!r} is not decimal.'.format(ch))
    # non-BMP characters that must count as decimal
    decimals = (u'\U0001D7F6', u'\U000104A0', u'\U000104A0')
    for ch in decimals:
        verdict = ch.isdecimal()
        self.assertTrue(verdict, '{!r} is decimal.'.format(ch))
398
def test_isdigit(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
    # rows: (expected isdigit() result, character)
    digit_cases = (
        (True, u'\u2460'),
        (False, u'\xbc'),
        (True, u'\u0660'),
    )
    for expected, ch in digit_cases:
        self.checkequalnofix(expected, ch, 'isdigit')
Walter Dörwald28256f22003-01-19 16:59:20 +0000404
@requires_wide_build
def test_isdigit_non_bmp(self):
    # non-BMP characters that must not count as digits
    non_digits = (u'\U00010401', u'\U00010427', u'\U00010429',
                  u'\U0001044E', u'\U0001F40D', u'\U0001F46F',
                  u'\U00011065')
    for ch in non_digits:
        verdict = ch.isdigit()
        self.assertFalse(verdict, '{!r} is not a digit.'.format(ch))
    # non-BMP characters that must count as digits
    digits = (u'\U0001D7F6', u'\U000104A0', u'\U000104A0', u'\U0001F107')
    for ch in digits:
        verdict = ch.isdigit()
        self.assertTrue(verdict, '{!r} is a digit.'.format(ch))
412
def test_isnumeric(self):
    # rows: (expected isnumeric() result, subject)
    numeric_cases = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (True, u'\u2460'),
        (True, u'\xbc'),
        (True, u'\u0660'),
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, subject in numeric_cases:
        self.checkequalnofix(expected, subject, 'isnumeric')

    # isnumeric() takes no arguments
    isnumeric = u"abc".isnumeric
    self.assertRaises(TypeError, isnumeric, 42)
424
@requires_wide_build
def test_isnumeric_non_bmp(self):
    # non-BMP characters that must not count as numeric
    non_numerics = (u'\U00010401', u'\U00010427', u'\U00010429',
                    u'\U0001044E', u'\U0001F40D', u'\U0001F46F')
    for ch in non_numerics:
        verdict = ch.isnumeric()
        self.assertFalse(verdict, '{!r} is not numeric.'.format(ch))
    # non-BMP characters that must count as numeric
    numerics = (u'\U00010107', u'\U0001D7F6', u'\U00023b1b',
                u'\U000104A0', u'\U0001F107')
    for ch in numerics:
        verdict = ch.isnumeric()
        self.assertTrue(verdict, '{!r} is numeric.'.format(ch))
433
@requires_wide_build
def test_surrogates(self):
    # this test actually passes on narrow too, but it's just by accident.
    # Surrogates are seen as non-cased chars, so u'X\uD800X' is as
    # uppercase as 'X X'
    lower_with_surrogates = (u'a\uD800b\uDFFF', u'a\uDFFFb\uD800',
                             u'a\uD800b\uDFFFa', u'a\uDFFFb\uD800a')
    upper_with_surrogates = (u'A\uD800B\uDFFF', u'A\uDFFFB\uD800',
                             u'A\uD800B\uDFFFA', u'A\uDFFFB\uD800A')
    only_surrogates = (u'\uD800', u'\uDFFF', u'\uD800\uD800', u'\uDFFF\uDFFF')

    for s in lower_with_surrogates:
        self.assertTrue(s.islower())
        self.assertFalse(s.isupper())
        self.assertFalse(s.istitle())
    for s in upper_with_surrogates:
        self.assertFalse(s.islower())
        self.assertTrue(s.isupper())
        self.assertTrue(s.istitle())

    # strings made only of surrogates have no case at all
    for meth_name in ('islower', 'isupper', 'istitle'):
        meth = getattr(unicode, meth_name)
        for s in only_surrogates:
            self.assertFalse(meth(s), '%r.%s() is False' % (s, meth_name))

    # a surrogate anywhere in the string makes these predicates False
    for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
                      'isdecimal', 'isnumeric'):
        meth = getattr(unicode, meth_name)
        for s in only_surrogates + lower_with_surrogates:
            self.assertFalse(meth(s), '%r.%s() is False' % (s, meth_name))
462
463
def test_lower(self):
    # The inherited checks do not depend on the build width, so they run
    # on every build instead of sitting behind requires_wide_build.
    string_tests.CommonTest.test_lower(self)

@requires_wide_build
def test_lower_non_bmp(self):
    # lower() on non-BMP cased characters, alone and mixed with ASCII
    self.assertEqual(u'\U00010427'.lower(), u'\U0001044F')
    self.assertEqual(u'\U00010427\U00010427'.lower(),
                     u'\U0001044F\U0001044F')
    self.assertEqual(u'\U00010427\U0001044F'.lower(),
                     u'\U0001044F\U0001044F')
    self.assertEqual(u'X\U00010427x\U0001044F'.lower(),
                     u'x\U0001044Fx\U0001044F')
474
def test_upper(self):
    # The inherited checks do not depend on the build width, so they run
    # on every build instead of sitting behind requires_wide_build.
    string_tests.CommonTest.test_upper(self)

@requires_wide_build
def test_upper_non_bmp(self):
    # upper() on non-BMP cased characters, alone and mixed with ASCII
    self.assertEqual(u'\U0001044F'.upper(), u'\U00010427')
    self.assertEqual(u'\U0001044F\U0001044F'.upper(),
                     u'\U00010427\U00010427')
    self.assertEqual(u'\U00010427\U0001044F'.upper(),
                     u'\U00010427\U00010427')
    self.assertEqual(u'X\U00010427x\U0001044F'.upper(),
                     u'X\U00010427X\U00010427')
485
@requires_wide_build
def test_capitalize_wide_build(self):
    # Only the non-BMP cases live here; the inherited
    # CommonTest.test_capitalize checks are already run by
    # test_capitalize, so they are not repeated.
    self.assertEqual(u'\U0001044F'.capitalize(), u'\U00010427')
    self.assertEqual(u'\U0001044F\U0001044F'.capitalize(),
                     u'\U00010427\U0001044F')
    self.assertEqual(u'\U00010427\U0001044F'.capitalize(),
                     u'\U00010427\U0001044F')
    self.assertEqual(u'\U0001044F\U00010427'.capitalize(),
                     u'\U00010427\U0001044F')
    self.assertEqual(u'X\U00010427x\U0001044F'.capitalize(),
                     u'X\U0001044Fx\U0001044F')
498
def test_title(self):
    # The inherited checks do not depend on the build width, so they run
    # on every build instead of sitting behind requires_wide_build.
    string_tests.MixinStrUnicodeUserStringTest.test_title(self)

@requires_wide_build
def test_title_non_bmp(self):
    # title() on non-BMP cased characters, in single and multiple words
    self.assertEqual(u'\U0001044F'.title(), u'\U00010427')
    self.assertEqual(u'\U0001044F\U0001044F'.title(),
                     u'\U00010427\U0001044F')
    self.assertEqual(u'\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
                     u'\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual(u'\U00010427\U0001044F \U00010427\U0001044F'.title(),
                     u'\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual(u'\U0001044F\U00010427 \U0001044F\U00010427'.title(),
                     u'\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual(u'X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
                     u'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
513
def test_swapcase(self):
    # The inherited checks do not depend on the build width, so they run
    # on every build instead of sitting behind requires_wide_build.
    string_tests.CommonTest.test_swapcase(self)

@requires_wide_build
def test_swapcase_non_bmp(self):
    # swapcase() on non-BMP cased characters, alone and mixed with ASCII
    self.assertEqual(u'\U0001044F'.swapcase(), u'\U00010427')
    self.assertEqual(u'\U00010427'.swapcase(), u'\U0001044F')
    self.assertEqual(u'\U0001044F\U0001044F'.swapcase(),
                     u'\U00010427\U00010427')
    self.assertEqual(u'\U00010427\U0001044F'.swapcase(),
                     u'\U0001044F\U00010427')
    self.assertEqual(u'\U0001044F\U00010427'.swapcase(),
                     u'\U00010427\U0001044F')
    self.assertEqual(u'X\U00010427x\U0001044F'.swapcase(),
                     u'x\U0001044FX\U00010427')
527
def test_contains(self):
    # Testing Unicode contains method
    # Membership is checked with every str/unicode combination of needle
    # and container, for string containers as well as tuples.
    self.assertIn('a', u'abdb')
    self.assertIn('a', u'bdab')
    self.assertIn('a', u'bdaba')
    self.assertIn('a', u'bdba')
    self.assertIn('a', u'bdba')
    self.assertIn(u'a', u'bdba')
    self.assertNotIn(u'a', u'bdb')
    self.assertNotIn(u'a', 'bdb')
    self.assertIn(u'a', 'bdba')
    # tuple containers mixing str, unicode and non-string items
    self.assertIn(u'a', ('a',1,None))
    self.assertIn(u'a', (1,None,'a'))
    self.assertIn(u'a', (1,None,u'a'))
    self.assertIn('a', ('a',1,None))
    self.assertIn('a', (1,None,'a'))
    self.assertIn('a', (1,None,u'a'))
    self.assertNotIn('a', ('x',1,u'y'))
    self.assertNotIn('a', ('x',1,None))
    # multi-character and empty needles
    self.assertNotIn(u'abcd', u'abcxxxx')
    self.assertIn(u'ab', u'abcd')
    self.assertIn('ab', u'abc')
    self.assertIn(u'ab', 'abc')
    self.assertIn(u'ab', (1,None,u'ab'))
    self.assertIn(u'', u'abc')
    self.assertIn('', u'abc')

    # If the following fails either
    # the contains operator does not propagate UnicodeErrors or
    # someone has changed the default encoding
    self.assertRaises(UnicodeDecodeError, 'g\xe2teau'.__contains__, u'\xe2')
    self.assertRaises(UnicodeDecodeError, u'g\xe2teau'.__contains__, '\xe2')

    # empty needles, NUL characters at either end, and whole-string
    # matches, each in str/unicode, unicode/str and unicode/unicode form
    self.assertIn(u'', '')
    self.assertIn('', u'')
    self.assertIn(u'', u'')
    self.assertIn(u'', 'abc')
    self.assertIn('', u'abc')
    self.assertIn(u'', u'abc')
    self.assertNotIn(u'\0', 'abc')
    self.assertNotIn('\0', u'abc')
    self.assertNotIn(u'\0', u'abc')
    self.assertIn(u'\0', '\0abc')
    self.assertIn('\0', u'\0abc')
    self.assertIn(u'\0', u'\0abc')
    self.assertIn(u'\0', 'abc\0')
    self.assertIn('\0', u'abc\0')
    self.assertIn(u'\0', u'abc\0')
    self.assertIn(u'a', '\0abc')
    self.assertIn('a', u'\0abc')
    self.assertIn(u'a', u'\0abc')
    self.assertIn(u'asdf', 'asdf')
    self.assertIn('asdf', u'asdf')
    self.assertIn(u'asdf', u'asdf')
    self.assertNotIn(u'asdf', 'asd')
    self.assertNotIn('asdf', u'asd')
    self.assertNotIn(u'asdf', u'asd')
    self.assertNotIn(u'asdf', '')
    self.assertNotIn('asdf', u'')
    self.assertNotIn(u'asdf', u'')

    # __contains__ needs exactly one argument, and it must be a string
    self.assertRaises(TypeError, u"abc".__contains__)
    self.assertRaises(TypeError, u"abc".__contains__, object())
Walter Dörwald28256f22003-01-19 16:59:20 +0000591
def test_formatting(self):
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57')
    # the %r check is skipped when sys.platform starts with 'java'
    if not sys.platform.startswith('java'):
        self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
    # mapping keys, including a non-ASCII key
    self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
    self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')

    # %c with an integer, an out-of-range integer, and a format string
    # containing a non-ASCII character inside the conversion spec
    self.assertEqual(u'%c' % 0x1234, u'\u1234')
    self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))
    self.assertRaises(ValueError, u"%.1\u1032f".__mod__, (1.0/3))

    # for code points below 0x80, %c accepts both the byte string
    # character and its integer value
    for num in range(0x00,0x80):
        char = chr(num)
        self.assertEqual(u"%c" % char, unicode(char))
        self.assertEqual(u"%c" % num, unicode(char))
        self.assertTrue(char == u"%c" % char)
        self.assertTrue(char == u"%c" % num)
    # Issue 7649
    for num in range(0x80,0x100):
        uchar = unichr(num)
        self.assertEqual(uchar, u"%c" % num)   # works only with ints
        self.assertEqual(uchar, u"%c" % uchar) # and unicode chars
        # the implicit decoding should fail for non-ascii chars
        self.assertRaises(UnicodeDecodeError, u"%c".__mod__, chr(num))
        self.assertRaises(UnicodeDecodeError, u"%s".__mod__, chr(num))

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123},  u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % u"abc", u'...abc...')
    self.assertEqual('%*s' % (5,u'abc',), u'  abc')
    self.assertEqual('%*s' % (-5,u'abc',), u'abc  ')
    self.assertEqual('%*.*s' % (5,2,u'abc',), u'   ab')
    self.assertEqual('%*.*s' % (5,3,u'abc',), u'  abc')
    self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103   abc')
    self.assertEqual('%c' % u'a', u'a')
    # an object whose __str__ returns unicode yields a unicode result
    class Wrapper:
        def __str__(self):
            return u'\u1234'
    self.assertEqual('%s' % Wrapper(), u'\u1234')
Tim Peters4511a712006-05-03 04:46:14 +0000646
def test_formatting_huge_precision(self):
    # A precision one past sys.maxsize must be rejected with ValueError.
    precision = sys.maxsize + 1
    format_string = u"%.{}f".format(precision)
    with self.assertRaises(ValueError):
        format_string % 2.34
651
@test_support.cpython_only
def test_formatting_huge_precision_c_limits(self):
    # A precision one past the C INT_MAX must be rejected with ValueError.
    import _testcapi
    precision = _testcapi.INT_MAX + 1
    format_string = u"%.{}f".format(precision)
    with self.assertRaises(ValueError):
        format_string % 2.34
658
def test_formatting_huge_width(self):
    # A field width one past sys.maxsize must be rejected with ValueError.
    width = sys.maxsize + 1
    format_string = u"%{}f".format(width)
    with self.assertRaises(ValueError):
        format_string % 2.34
663
def test_startswith_endswith_errors(self):
    # Both methods must refuse a non-ASCII byte string (implicit decode
    # fails) and a list argument; the TypeError message has to name the
    # accepted types.
    subject = u'foo'
    for meth in (subject.startswith, subject.endswith):
        self.assertRaises(UnicodeDecodeError, meth, '\xff')
        with self.assertRaises(TypeError) as caught:
            meth(['f'])
        message = str(caught.exception)
        for type_name in ('unicode', 'str', 'tuple'):
            self.assertIn(type_name, message)
674
@test_support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    # Float formatting must always use the C locale: the decimal separator
    # stays a dot, never a comma, whatever locale is active.
    formatted = u'%.1f' % 1.0
    self.assertEqual(u'1.0', formatted)
Georg Brandlda6b1072006-01-20 17:48:54 +0000679
Walter Dörwald28256f22003-01-19 16:59:20 +0000680 def test_constructor(self):
681 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
682
683 self.assertEqual(
684 unicode(u'unicode remains unicode'),
685 u'unicode remains unicode'
686 )
687
688 class UnicodeSubclass(unicode):
Marc-André Lemburg79f57832002-12-29 19:44:06 +0000689 pass
Guido van Rossuma831cac2000-03-10 23:23:21 +0000690
Walter Dörwald28256f22003-01-19 16:59:20 +0000691 self.assertEqual(
692 unicode(UnicodeSubclass('unicode subclass becomes unicode')),
693 u'unicode subclass becomes unicode'
694 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000695
Walter Dörwald28256f22003-01-19 16:59:20 +0000696 self.assertEqual(
697 unicode('strings are converted to unicode'),
698 u'strings are converted to unicode'
699 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000700
Walter Dörwald28256f22003-01-19 16:59:20 +0000701 class UnicodeCompat:
702 def __init__(self, x):
703 self.x = x
704 def __unicode__(self):
705 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000706
Walter Dörwald28256f22003-01-19 16:59:20 +0000707 self.assertEqual(
708 unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
709 u'__unicode__ compatible objects are recognized')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000710
Walter Dörwald28256f22003-01-19 16:59:20 +0000711 class StringCompat:
712 def __init__(self, x):
713 self.x = x
714 def __str__(self):
715 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000716
Walter Dörwald28256f22003-01-19 16:59:20 +0000717 self.assertEqual(
718 unicode(StringCompat('__str__ compatible objects are recognized')),
719 u'__str__ compatible objects are recognized'
720 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000721
Walter Dörwald28256f22003-01-19 16:59:20 +0000722 # unicode(obj) is compatible to str():
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000723
Walter Dörwald28256f22003-01-19 16:59:20 +0000724 o = StringCompat('unicode(obj) is compatible to str()')
725 self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
726 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000727
Marc-André Lemburgd25c6502004-07-23 16:13:25 +0000728 # %-formatting and .__unicode__()
729 self.assertEqual(u'%s' %
730 UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"),
731 u"u'%s' % obj uses obj.__unicode__()")
732 self.assertEqual(u'%s' %
733 UnicodeCompat(u"u'%s' % obj falls back to obj.__str__()"),
734 u"u'%s' % obj falls back to obj.__str__()")
735
Walter Dörwald28256f22003-01-19 16:59:20 +0000736 for obj in (123, 123.45, 123L):
737 self.assertEqual(unicode(obj), unicode(str(obj)))
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000738
Walter Dörwald28256f22003-01-19 16:59:20 +0000739 # unicode(obj, encoding, error) tests (this maps to
740 # PyUnicode_FromEncodedObject() at C level)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000741
Walter Dörwald28256f22003-01-19 16:59:20 +0000742 if not sys.platform.startswith('java'):
743 self.assertRaises(
744 TypeError,
745 unicode,
746 u'decoding unicode is not supported',
747 'utf-8',
748 'strict'
749 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000750
Walter Dörwald28256f22003-01-19 16:59:20 +0000751 self.assertEqual(
752 unicode('strings are decoded to unicode', 'utf-8', 'strict'),
753 u'strings are decoded to unicode'
754 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000755
Walter Dörwald28256f22003-01-19 16:59:20 +0000756 if not sys.platform.startswith('java'):
Florent Xicluna6de9e932010-03-07 12:18:33 +0000757 with test_support.check_py3k_warnings():
Antoine Pitrou5b7139a2010-01-02 21:12:58 +0000758 buf = buffer('character buffers are decoded to unicode')
Walter Dörwald28256f22003-01-19 16:59:20 +0000759 self.assertEqual(
760 unicode(
Antoine Pitrou5b7139a2010-01-02 21:12:58 +0000761 buf,
Walter Dörwald28256f22003-01-19 16:59:20 +0000762 'utf-8',
763 'strict'
764 ),
765 u'character buffers are decoded to unicode'
766 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000767
Walter Dörwald28256f22003-01-19 16:59:20 +0000768 self.assertRaises(TypeError, unicode, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000769
Walter Dörwald28256f22003-01-19 16:59:20 +0000770 def test_codecs_utf7(self):
771 utfTests = [
772 (u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
773 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
774 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
775 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
776 (u'+', '+-'),
777 (u'+-', '+--'),
778 (u'+?', '+-?'),
779 (u'\?', '+AFw?'),
780 (u'+?', '+-?'),
781 (ur'\\?', '+AFwAXA?'),
782 (ur'\\\?', '+AFwAXABc?'),
Antoine Pitrou653dece2009-05-04 18:32:32 +0000783 (ur'++--', '+-+---'),
784 (u'\U000abcde', '+2m/c3g-'), # surrogate pairs
785 (u'/', '/'),
Walter Dörwald28256f22003-01-19 16:59:20 +0000786 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000787
Walter Dörwald28256f22003-01-19 16:59:20 +0000788 for (x, y) in utfTests:
789 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000790
Antoine Pitrou30402542011-11-15 01:49:40 +0100791 # Unpaired surrogates are passed through
792 self.assertEqual(u'\uD801'.encode('utf-7'), '+2AE-')
793 self.assertEqual(u'\uD801x'.encode('utf-7'), '+2AE-x')
794 self.assertEqual(u'\uDC01'.encode('utf-7'), '+3AE-')
795 self.assertEqual(u'\uDC01x'.encode('utf-7'), '+3AE-x')
796 self.assertEqual('+2AE-'.decode('utf-7'), u'\uD801')
797 self.assertEqual('+2AE-x'.decode('utf-7'), u'\uD801x')
798 self.assertEqual('+3AE-'.decode('utf-7'), u'\uDC01')
799 self.assertEqual('+3AE-x'.decode('utf-7'), u'\uDC01x')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000800
Antoine Pitrou30402542011-11-15 01:49:40 +0100801 self.assertEqual(u'\uD801\U000abcde'.encode('utf-7'), '+2AHab9ze-')
802 self.assertEqual('+2AHab9ze-'.decode('utf-7'), u'\uD801\U000abcde')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000803
Antoine Pitrou653dece2009-05-04 18:32:32 +0000804 # Direct encoded characters
805 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
806 # Optional direct characters
807 set_o = '!"#$%&*;<=>@[]^_`{|}'
808 for c in set_d:
809 self.assertEqual(c.encode('utf7'), c.encode('ascii'))
Florent Xiclunac0c0b142010-09-13 08:53:00 +0000810 self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c))
811 self.assertTrue(c == c.encode('ascii').decode('utf7'))
Antoine Pitrou653dece2009-05-04 18:32:32 +0000812 for c in set_o:
Florent Xiclunac0c0b142010-09-13 08:53:00 +0000813 self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c))
814 self.assertTrue(c == c.encode('ascii').decode('utf7'))
Antoine Pitrou4982d5d2008-07-25 17:45:59 +0000815
def test_codecs_utf8(self):
    # UTF-8 encoding of BMP text, surrogate pairs and lone surrogates,
    # followed by a few decoding checks.
    short_cases = [
        (u'', ''),
        (u'\u20ac', '\xe2\x82\xac'),
        (u'\ud800\udc02', '\xf0\x90\x80\x82'),
        (u'\ud84d\udc56', '\xf0\xa3\x91\x96'),
        (u'\ud800', '\xed\xa0\x80'),
        (u'\udc00', '\xed\xb0\x80'),
    ]
    for text, encoded in short_cases:
        self.assertEqual(text.encode('utf-8'), encoded)

    self.assertEqual(
        (u'\ud800\udc02'*1000).encode('utf-8'),
        '\xf0\x90\x80\x82'*1000
    )

    long_text = (
        u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
        u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
        u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
        u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
        u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
        u' Nunstuck git und'
    )
    long_encoded = (
        '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
        '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
        '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
        '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
        '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
        '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
        '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
        '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
        '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
        '\xe3\x80\x8cWenn ist das Nunstuck git und'
    )
    self.assertEqual(long_text.encode('utf-8'), long_encoded)

    # UTF-8 specific decoding tests
    decode_cases = [
        ('\xf0\xa3\x91\x96', u'\U00023456'),
        ('\xf0\x90\x80\x82', u'\U00010002'),
        ('\xe2\x82\xac', u'\u20ac'),
    ]
    for encoded, text in decode_cases:
        self.assertEqual(unicode(encoded, 'utf-8'), text)

    # Other possible utf-8 test cases:
    # * strict decoding testing for all of the
    #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000854
def test_utf8_decode_valid_sequences(self):
    # Boundary values of every UTF-8 sequence length must decode to the
    # expected code point, and every code point must survive a round trip.
    sequences = [
        # single byte
        ('\x00', u'\x00'), ('a', u'a'), ('\x7f', u'\x7f'),
        # 2 bytes
        ('\xc2\x80', u'\x80'), ('\xdf\xbf', u'\u07ff'),
        # 3 bytes
        ('\xe0\xa0\x80', u'\u0800'), ('\xed\x9f\xbf', u'\ud7ff'),
        ('\xee\x80\x80', u'\uE000'), ('\xef\xbf\xbf', u'\uffff'),
        # 4 bytes
        ('\xF0\x90\x80\x80', u'\U00010000'),
        ('\xf4\x8f\xbf\xbf', u'\U0010FFFF')
    ]
    for seq, res in sequences:
        self.assertEqual(seq.decode('utf-8'), res)

    # The upper bound is inclusive: sys.maxunicode itself is a valid code
    # point and must be covered.  xrange avoids materialising two lists
    # of more than a million elements on wide builds.
    for code_point in xrange(sys.maxunicode + 1):
        ch = unichr(code_point)
        self.assertEqual(ch, ch.encode('utf-8').decode('utf-8'))
873
def test_utf8_decode_invalid_sequences(self):
    # Strict UTF-8 decoding must reject invalid start bytes, overlong
    # forms and sequences that would encode values above U+10FFFF.

    # continuation bytes in a sequence of 2, 3, or 4 bytes
    continuation_bytes = map(chr, range(0x80, 0xC0))
    # start bytes of a 2-byte sequence equivalent to code points <= 0x7F
    invalid_2B_seq_start_bytes = map(chr, range(0xC0, 0xC2))
    # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
    invalid_4B_seq_start_bytes = map(chr, range(0xF5, 0xF8))
    # 0xF5-0xF7 are already in invalid_4B_seq_start_bytes, so the
    # remaining invalid start bytes begin at 0xF8 (no duplicate 0xF7).
    invalid_start_bytes = (
        continuation_bytes + invalid_2B_seq_start_bytes +
        invalid_4B_seq_start_bytes + map(chr, range(0xF8, 0x100))
    )

    for byte in invalid_start_bytes:
        self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')

    for sb in invalid_2B_seq_start_bytes:
        for cb in continuation_bytes:
            self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')

    for sb in invalid_4B_seq_start_bytes:
        for cb1 in continuation_bytes[:3]:
            for cb3 in continuation_bytes[:3]:
                self.assertRaises(UnicodeDecodeError,
                                  (sb+cb1+'\x80'+cb3).decode, 'utf-8')

    # overlong 3-byte forms: 0xE0 followed by a second byte below 0xA0
    for cb in map(chr, range(0x80, 0xA0)):
        self.assertRaises(UnicodeDecodeError,
                          ('\xE0'+cb+'\x80').decode, 'utf-8')
        self.assertRaises(UnicodeDecodeError,
                          ('\xE0'+cb+'\xBF').decode, 'utf-8')
    # XXX: surrogates shouldn't be valid UTF-8!
    # see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
    # (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
    #for cb in map(chr, range(0xA0, 0xC0)):
        #self.assertRaises(UnicodeDecodeError,
                          #('\xED'+cb+'\x80').decode, 'utf-8')
        #self.assertRaises(UnicodeDecodeError,
                          #('\xED'+cb+'\xBF').decode, 'utf-8')
    # but since they are valid on Python 2 add a test for that:
    for cb, surrogate in zip(map(chr, range(0xA0, 0xC0)),
                             map(unichr, range(0xd800, 0xe000, 64))):
        encoded = '\xED'+cb+'\x80'
        self.assertEqual(encoded.decode('utf-8'), surrogate)
        self.assertEqual(surrogate.encode('utf-8'), encoded)

    # overlong 4-byte forms: 0xF0 followed by a second byte below 0x90
    for cb in map(chr, range(0x80, 0x90)):
        self.assertRaises(UnicodeDecodeError,
                          ('\xF0'+cb+'\x80\x80').decode, 'utf-8')
        self.assertRaises(UnicodeDecodeError,
                          ('\xF0'+cb+'\xBF\xBF').decode, 'utf-8')
    # values above U+10FFFF: 0xF4 followed by a second byte of 0x90 or more
    for cb in map(chr, range(0x90, 0xC0)):
        self.assertRaises(UnicodeDecodeError,
                          ('\xF4'+cb+'\x80\x80').decode, 'utf-8')
        self.assertRaises(UnicodeDecodeError,
                          ('\xF4'+cb+'\xBF\xBF').decode, 'utf-8')
929
def test_issue8271(self):
    # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
    # only the start byte and the continuation byte(s) are now considered
    # invalid, instead of the number of bytes specified by the start byte.
    # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
    # table 3-8, Row 2) for more information about the algorithm used.
    #
    # In the comments below "cb" means continuation byte: '\x80' is a
    # valid one, '\x41' ('A') is not.
    FFFD = u'\ufffd'
    sequences = [
        # invalid start bytes
        ('\x80', FFFD), # continuation byte
        ('\x80\x80', FFFD*2), # 2 continuation bytes
        ('\xc0', FFFD),
        ('\xc0\xc0', FFFD*2),
        ('\xc1', FFFD),
        ('\xc1\xc0', FFFD*2),
        ('\xc0\xc1', FFFD*2),
        # with start byte of a 2-byte sequence
        ('\xc2', FFFD), # only the start byte
        ('\xc2\xc2', FFFD*2), # 2 start bytes
        ('\xc2\xc2\xc2', FFFD*3), # 3 start bytes
        ('\xc2\x41', FFFD+'A'), # invalid continuation byte
        # with start byte of a 3-byte sequence
        ('\xe1', FFFD), # only the start byte
        ('\xe1\xe1', FFFD*2), # 2 start bytes
        ('\xe1\xe1\xe1', FFFD*3), # 3 start bytes
        ('\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
        ('\xe1\x80', FFFD), # only 1 continuation byte
        ('\xe1\x41', FFFD+'A'), # invalid continuation byte
        ('\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
        ('\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
        ('\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
        ('\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
        ('\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
        # with start byte of a 4-byte sequence
        ('\xf1', FFFD), # only the start byte
        ('\xf1\xf1', FFFD*2), # 2 start bytes
        ('\xf1\xf1\xf1', FFFD*3), # 3 start bytes
        ('\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
        ('\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
        ('\xf1\x80', FFFD), # only 1 continuation byte
        ('\xf1\x80\x80', FFFD), # only 2 continuation bytes
        ('\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
        ('\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 2 invalid
        ('\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
        ('\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cb and 1 valid
        ('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 valid
        ('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 valid
        ('\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 2 invalid cb and 1 valid
        ('\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
        ('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
        ('\xf1\xf1\x80\x41', FFFD*2+'A'),
        ('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
        # with invalid start byte of a 4-byte sequence (rfc2279)
        ('\xf5', FFFD), # only the start byte
        ('\xf5\xf5', FFFD*2), # 2 start bytes
        ('\xf5\x80', FFFD*2), # only 1 continuation byte
        ('\xf5\x80\x80', FFFD*3), # only 2 continuation bytes
        ('\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
        ('\xf5\x80\x41', FFFD*2+'A'), #  1 valid cb and 1 invalid
        ('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
        ('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
        # with invalid start byte of a 5-byte sequence (rfc2279)
        ('\xf8', FFFD), # only the start byte
        ('\xf8\xf8', FFFD*2), # 2 start bytes
        ('\xf8\x80', FFFD*2), # only one continuation byte
        ('\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
        ('\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
        # with invalid start byte of a 6-byte sequence (rfc2279)
        ('\xfc', FFFD), # only the start byte
        ('\xfc\xfc', FFFD*2), # 2 start bytes
        ('\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
        ('\xfc\x80\x80\x80\x80\x80', FFFD*6), # start byte + 5 continuation bytes
        # invalid start byte
        ('\xfe', FFFD),
        ('\xfe\x80\x80', FFFD*3),
        # other sequences
        ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
        ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
        ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
        ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
         u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
    ]
    for seq, res in sequences:
        # every sequence is invalid, so strict decoding must fail ...
        self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
        # ... 'replace' yields one U+FFFD per rejected unit, also when
        # valid data follows the invalid sequence ...
        self.assertEqual(seq.decode('utf-8', 'replace'), res)
        self.assertEqual((seq+'b').decode('utf-8', 'replace'), res+'b')
        # ... and 'ignore' drops exactly what 'replace' substitutes.
        self.assertEqual(seq.decode('utf-8', 'ignore'),
                         res.replace(u'\uFFFD', ''))
1018
def test_codecs_idna(self):
    # The IDNA codec has to keep the trailing dot of a hostname.
    encoded = u"www.python.org.".encode("idna")
    self.assertEqual(encoded, "www.python.org.")
1022
def test_codecs_errors(self):
    # Error paths of encoding and decoding: error handlers, keyword
    # arguments, broken codecs and wrong argument types.
    text = u'Andr\202 x'
    raw = 'Andr\202 x'

    # Error handling (encoding)
    self.assertRaises(UnicodeError, text.encode, 'ascii')
    self.assertRaises(UnicodeError, text.encode, 'ascii','strict')
    self.assertEqual(text.encode('ascii','ignore'), "Andr x")
    self.assertEqual(text.encode('ascii','replace'), "Andr? x")
    # positional and keyword spellings must agree
    self.assertEqual(text.encode('ascii', 'replace'),
                     text.encode('ascii', errors='replace'))
    self.assertEqual(text.encode('ascii', 'ignore'),
                     text.encode(encoding='ascii', errors='ignore'))

    # Error handling (decoding)
    self.assertRaises(UnicodeError, unicode, raw, 'ascii')
    self.assertRaises(UnicodeError, unicode, raw, 'ascii','strict')
    self.assertEqual(unicode(raw,'ascii','ignore'), u"Andr x")
    self.assertEqual(unicode(raw,'ascii','replace'), u'Andr\uFFFD x')
    self.assertEqual(unicode('\202 x', 'ascii', 'replace'), u'\uFFFD x')
    plain = u'abcde'
    self.assertEqual(plain.decode('ascii', 'ignore'),
                     plain.decode('ascii', errors='ignore'))
    self.assertEqual(plain.decode('ascii', 'replace'),
                     plain.decode(encoding='ascii', errors='replace'))

    # Error handling (unknown character names)
    self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx")

    # Error handling (truncated escape sequence)
    self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")

    # Codecs registered by this module that return malformed results
    self.assertRaises(TypeError, "hello".decode, "test.unicode1")
    self.assertRaises(TypeError, unicode, "hello", "test.unicode2")
    self.assertRaises(TypeError, u"hello".encode, "test.unicode1")
    self.assertRaises(TypeError, u"hello".encode, "test.unicode2")
    # executes PyUnicode_Encode()
    import imp
    self.assertRaises(
        ImportError,
        imp.find_module,
        "non-existing module",
        [u"non-existing dir"]
    )

    # Error handling (wrong arguments)
    self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)

    # Error handling (PyUnicode_EncodeDecimal())
    self.assertRaises(UnicodeError, int, u"\u0200")
Guido van Rossum97064862000-04-10 13:52:48 +00001069
def test_codecs(self):
    # Basic encoding results followed by encode/decode round trips over
    # progressively smaller code point ranges.
    def check_roundtrip(text, encoding):
        self.assertEqual(unicode(text.encode(encoding),encoding), text)

    # Encoding
    hello = u'hello'
    expected_encodings = [
        ('ascii', 'hello'),
        ('utf-7', 'hello'),
        ('utf-8', 'hello'),
        ('utf8', 'hello'),
        ('utf-16-le', 'h\000e\000l\000l\000o\000'),
        ('utf-16-be', '\000h\000e\000l\000l\000o'),
        ('latin-1', 'hello'),
    ]
    for encoding, expected in expected_encodings:
        self.assertEqual(hello.encode(encoding), expected)

    bmp_roundtrips = [
        # Roundtrip safety for BMP (just the first 1024 chars)
        (1024, ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
                'utf-16-be', 'raw_unicode_escape',
                'unicode_escape', 'unicode_internal')),
        # Roundtrip safety for BMP (just the first 256 chars)
        (256, ('latin-1',)),
        # Roundtrip safety for BMP (just the first 128 chars)
        (128, ('ascii',)),
    ]
    for limit, encodings in bmp_roundtrips:
        for code_point in xrange(limit):
            char = unichr(code_point)
            for encoding in encodings:
                check_roundtrip(char, encoding)

    # Roundtrip safety for non-BMP (just a few chars)
    non_bmp = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
    for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                     #'raw_unicode_escape',
                     'unicode_escape', 'unicode_internal'):
        check_roundtrip(non_bmp, encoding)

    # UTF-8 must be roundtrip safe for all UCS-2 code points
    # This excludes surrogates: in the full range, there would be
    # a surrogate pair (\udbff\udc00), which gets converted back
    # to a non-BMP character (\U0010fc00)
    ucs2 = u''.join(map(unichr, range(0,0xd800)+range(0xe000,0x10000)))
    for encoding in ('utf-8',):
        check_roundtrip(ucs2, encoding)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001114
def test_codecs_charmap(self):
    # Decode/encode round trips of raw bytes through the charmap codecs,
    # separately for the lower (0-127) and upper (128-255) half.
    def check_roundtrip(data, encodings):
        for encoding in encodings:
            self.assertEqual(unicode(data, encoding).encode(encoding), data)

    # 0-127
    lower_half = ''.join(chr(i) for i in xrange(128))
    lower_half_encodings = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
        'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        'cp1256', 'cp1257', 'cp1258',
        'cp856', 'cp857', 'cp864', 'cp869', 'cp874',

        'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
        'cp1006', 'iso8859_8',

        ### These have undefined mappings:
        #'cp424',

        ### These fail the round-trip:
        #'cp875'

    )
    check_roundtrip(lower_half, lower_half_encodings)

    # 128-255
    upper_half = ''.join(chr(i) for i in xrange(128, 256))
    upper_half_encodings = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_4', 'iso8859_5',
        'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        ### These have undefined mappings:
        #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        #'cp1256', 'cp1257', 'cp1258',
        #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
        #'iso8859_3', 'iso8859_6', 'iso8859_7',
        #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',

        ### These fail the round-trip:
        #'cp1006', 'cp875', 'iso8859_8',

    )
    check_roundtrip(upper_half, upper_half_encodings)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001168
def test_concatenation(self):
    # Adjacent string literals are joined at compile time; as soon as one
    # of them is a unicode literal the result has to be unicode.
    cases = [
        ((u"abc" u"def"), u"abcdef"),
        (("abc" u"def"), u"abcdef"),
        ((u"abc" "def"), u"abcdef"),
        ((u"abc" u"def" "ghi"), u"abcdefghi"),
        (("abc" "def" u"ghi"), u"abcdefghi"),
    ]
    for joined, expected in cases:
        self.assertEqual(joined, expected)
Fred Drake004d5e62000-10-23 17:22:08 +00001175
def test_printing(self):
    # The print statement must accept unicode and mixed str/unicode
    # arguments, with and without a trailing comma.  The output itself is
    # discarded; the test only checks that nothing raises.
    class BitBucket:
        def write(self, text):
            pass

    sink = BitBucket()
    print >>sink, u'abc'
    print >>sink, u'abc', u'def'
    print >>sink, u'abc', 'def'
    print >>sink, 'abc', u'def'
    print >>sink, u'abc\n'
    print >>sink, u'abc\n',
    print >>sink, u'abc\n',
    print >>sink, u'def\n'
    print >>sink, u'def\n'
Fred Drake004d5e62000-10-23 17:22:08 +00001191
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00001192 def test_ucs4(self):
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00001193 x = u'\U00100000'
1194 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
1195 self.assertEqual(x, y)
1196
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00001197 y = r'\U00100000'
1198 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1199 self.assertEqual(x, y)
1200 y = r'\U00010000'
1201 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1202 self.assertEqual(x, y)
1203
1204 try:
1205 '\U11111111'.decode("raw-unicode-escape")
1206 except UnicodeDecodeError as e:
1207 self.assertEqual(e.start, 0)
1208 self.assertEqual(e.end, 10)
1209 else:
1210 self.fail("Should have raised UnicodeDecodeError")
1211
def test_conversion(self):
    # Make sure __unicode__() works properly
    class Foo0:
        # classic class, __str__ only
        def __str__(self):
            return "foo"

    class Foo1:
        # classic class, __unicode__ returning unicode
        def __unicode__(self):
            return u"foo"

    class Foo2(object):
        # new-style class, __unicode__ returning unicode
        def __unicode__(self):
            return u"foo"

    class Foo3(object):
        # new-style class, __unicode__ returning a byte string
        def __unicode__(self):
            return "foo"

    class Foo4(str):
        def __unicode__(self):
            return "foo"

    class Foo5(unicode):
        def __unicode__(self):
            return "foo"

    class Foo6(str):
        def __str__(self):
            return "foos"

        def __unicode__(self):
            return u"foou"

    class Foo7(unicode):
        def __str__(self):
            return "foos"
        def __unicode__(self):
            return u"foou"

    class Foo8(unicode):
        # the constructor doubles its content
        def __new__(cls, content=""):
            return unicode.__new__(cls, 2*content)
        def __unicode__(self):
            return self

    class Foo9(unicode):
        def __str__(self):
            return "string"
        def __unicode__(self):
            return "not unicode"

    conversions = [
        (Foo0(), u"foo"),
        (Foo1(), u"foo"),
        (Foo2(), u"foo"),
        (Foo3(), u"foo"),
        (Foo4("bar"), u"foo"),
        (Foo5("bar"), u"foo"),
        (Foo6("bar"), u"foou"),
        (Foo7("bar"), u"foou"),
        (Foo8("foo"), u"foofoo"),
    ]
    for obj, expected in conversions:
        self.assertEqual(unicode(obj), expected)

    # str() and unicode() each pick their own special method
    self.assertEqual(str(Foo9("foo")), "string")
    self.assertEqual(unicode(Foo9("foo")), u"not unicode")
1274
def test_unicode_repr(self):
    # repr() has to give the same byte string whether __repr__ returns
    # a str (s1) or a unicode object (s2).
    class s1:
        def __repr__(self):
            return '\\n'

    class s2:
        def __repr__(self):
            return u'\\n'

    for cls in (s1, s2):
        shown = repr(cls())
        self.assertEqual(shown, '\\n')
1286
# This test only affects 32-bit platforms because expandtabs can only take
# an int as the max value, not a 64-bit C long.  If expandtabs is changed
# to take a 64-bit long, this test should apply to all platforms.
@unittest.skipIf(sys.maxint > (1 << 32) or struct.calcsize('P') != 4,
                 'only applies to 32-bit platforms')
def test_expandtabs_overflows_gracefully(self):
    # A tab size of sys.maxint must be reported as an OverflowError.
    with self.assertRaises(OverflowError):
        u't\tt\t'.expandtabs(sys.maxint)
Anthony Baxter67b6d512006-03-30 10:54:07 +00001294
def test__format__(self):
    # Call unicode.__format__() directly with precision, width, fill and
    # alignment specs.  The expected strings spell out the padding
    # exactly: the result is always max(width, len(text)) characters long.
    def test(value, format, expected):
        # test both with and without the trailing 's'
        self.assertEqual(value.__format__(format), expected)
        self.assertEqual(value.__format__(format + u's'), expected)

    # precision truncates, width pads
    test(u'', u'', u'')
    test(u'abc', u'', u'abc')
    test(u'abc', u'.3', u'abc')
    test(u'ab', u'.3', u'ab')
    test(u'abcdef', u'.3', u'abc')
    test(u'abcdef', u'.0', u'')
    test(u'abc', u'3.3', u'abc')
    test(u'abc', u'2.3', u'abc')
    test(u'abc', u'2.2', u'ab')
    test(u'abc', u'3.2', u'ab ')

    # explicit fill character with left alignment
    test(u'result', u'x<0', u'result')
    test(u'result', u'x<5', u'result')
    test(u'result', u'x<6', u'result')
    test(u'result', u'x<7', u'resultx')
    test(u'result', u'x<8', u'resultxx')

    # default (space) fill; u'result' is six characters long
    test(u'result', u' <7', u'result ')
    test(u'result', u'<7', u'result ')
    test(u'result', u'>7', u' result')
    test(u'result', u'>8', u'  result')
    # when centering, an odd leftover fill character goes on the right
    test(u'result', u'^8', u' result ')
    test(u'result', u'^9', u' result  ')
    test(u'result', u'^10', u'  result  ')

    # large widths
    test(u'a', u'10000', u'a' + u' ' * 9999)
    test(u'', u'10000', u' ' * 10000)
    test(u'', u'10000000', u' ' * 10000000)

    # test mixing unicode and str
    self.assertEqual(u'abc'.__format__('s'), u'abc')
    self.assertEqual(u'abc'.__format__('->10s'), u'-------abc')
1330
def test_format(self):
    # Broad test of unicode.format(): brace escaping, positional and
    # attribute/index field names, string format specs, !r/!s
    # conversions, user-defined __format__, nested (computed) specs,
    # malformed templates and mixing str with unicode.
    #
    # Expected strings spell out padding exactly; long runs of fill
    # characters are written as ``' ' * n`` so the count is explicit.
    self.assertEqual(u''.format(), u'')
    self.assertEqual(u'a'.format(), u'a')
    self.assertEqual(u'ab'.format(), u'ab')
    self.assertEqual(u'a{{'.format(), u'a{')
    self.assertEqual(u'a}}'.format(), u'a}')
    self.assertEqual(u'{{b'.format(), u'{b')
    self.assertEqual(u'}}b'.format(), u'}b')
    self.assertEqual(u'a{{b'.format(), u'a{b')

    # examples from the PEP:
    import datetime
    self.assertEqual(u"My name is {0}".format(u'Fred'), u"My name is Fred")
    self.assertEqual(u"My name is {0[name]}".format(dict(name=u'Fred')),
                     u"My name is Fred")
    self.assertEqual(u"My name is {0} :-{{}}".format(u'Fred'),
                     u"My name is Fred :-{}")

    # datetime.__format__ doesn't work with unicode
    #d = datetime.date(2007, 8, 18)
    #self.assertEqual("The year is {0.year}".format(d),
    #                 "The year is 2007")

    # classes we'll use for testing
    class C:
        def __init__(self, x=100):
            self._x = x
        def __format__(self, spec):
            return spec

    class D:
        def __init__(self, x):
            self.x = x
        def __format__(self, spec):
            return str(self.x)

    # class with __str__, but no __format__
    class E:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return u'E(' + self.x + u')'

    # class with __repr__, but no __format__ or __str__
    class F:
        def __init__(self, x):
            self.x = x
        def __repr__(self):
            return u'F(' + self.x + u')'

    # class with __format__ that forwards to string, for some format_spec's
    class G:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return u"string is " + self.x
        def __format__(self, format_spec):
            if format_spec == 'd':
                return u'G(' + self.x + u')'
            return object.__format__(self, format_spec)

    # class that returns a bad type from __format__
    class H:
        def __format__(self, format_spec):
            return 1.0

    class I(datetime.date):
        def __format__(self, format_spec):
            return self.strftime(format_spec)

    class J(int):
        def __format__(self, format_spec):
            return int.__format__(self * 2, format_spec)


    self.assertEqual(u''.format(), u'')
    self.assertEqual(u'abc'.format(), u'abc')
    self.assertEqual(u'{0}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:}'.format(u'abc'), u'abc')
    self.assertEqual(u'X{0}'.format(u'abc'), u'Xabc')
    self.assertEqual(u'{0}X'.format(u'abc'), u'abcX')
    self.assertEqual(u'X{0}Y'.format(u'abc'), u'XabcY')
    self.assertEqual(u'{1}'.format(1, u'abc'), u'abc')
    self.assertEqual(u'X{1}'.format(1, u'abc'), u'Xabc')
    self.assertEqual(u'{1}X'.format(1, u'abc'), u'abcX')
    self.assertEqual(u'X{1}Y'.format(1, u'abc'), u'XabcY')
    self.assertEqual(u'{0}'.format(-15), u'-15')
    self.assertEqual(u'{0}{1}'.format(-15, u'abc'), u'-15abc')
    self.assertEqual(u'{0}X{1}'.format(-15, u'abc'), u'-15Xabc')
    self.assertEqual(u'{{'.format(), u'{')
    self.assertEqual(u'}}'.format(), u'}')
    self.assertEqual(u'{{}}'.format(), u'{}')
    self.assertEqual(u'{{x}}'.format(), u'{x}')
    self.assertEqual(u'{{{0}}}'.format(123), u'{123}')
    self.assertEqual(u'{{{{0}}}}'.format(), u'{{0}}')
    self.assertEqual(u'}}{{'.format(), u'}{')
    self.assertEqual(u'}}x{{'.format(), u'}x{')

    # weird field names
    self.assertEqual(u"{0[foo-bar]}".format({u'foo-bar':u'baz'}), u'baz')
    self.assertEqual(u"{0[foo bar]}".format({u'foo bar':u'baz'}), u'baz')
    self.assertEqual(u"{0[ ]}".format({u' ':3}), u'3')

    self.assertEqual(u'{foo._x}'.format(foo=C(20)), u'20')
    self.assertEqual(u'{1}{0}'.format(D(10), D(20)), u'2010')
    self.assertEqual(u'{0._x.x}'.format(C(D(u'abc'))), u'abc')
    self.assertEqual(u'{0[0]}'.format([u'abc', u'def']), u'abc')
    self.assertEqual(u'{0[1]}'.format([u'abc', u'def']), u'def')
    self.assertEqual(u'{0[1][0]}'.format([u'abc', [u'def']]), u'def')
    self.assertEqual(u'{0[1][0].x}'.format(['abc', [D(u'def')]]), u'def')

    # strings
    self.assertEqual(u'{0:.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:.3s}'.format(u'ab'), u'ab')
    self.assertEqual(u'{0:.3s}'.format(u'abcdef'), u'abc')
    self.assertEqual(u'{0:.0s}'.format(u'abcdef'), u'')
    self.assertEqual(u'{0:3.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:2.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:2.2s}'.format(u'abc'), u'ab')
    self.assertEqual(u'{0:3.2s}'.format(u'abc'), u'ab ')
    self.assertEqual(u'{0:x<0s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<5s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<6s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<7s}'.format(u'result'), u'resultx')
    self.assertEqual(u'{0:x<8s}'.format(u'result'), u'resultxx')
    self.assertEqual(u'{0: <7s}'.format(u'result'), u'result ')
    self.assertEqual(u'{0:<7s}'.format(u'result'), u'result ')
    self.assertEqual(u'{0:>7s}'.format(u'result'), u' result')
    self.assertEqual(u'{0:>8s}'.format(u'result'), u'  result')
    # when centering, an odd leftover fill character goes on the right
    self.assertEqual(u'{0:^8s}'.format(u'result'), u' result ')
    self.assertEqual(u'{0:^9s}'.format(u'result'), u' result  ')
    self.assertEqual(u'{0:^10s}'.format(u'result'), u'  result  ')
    self.assertEqual(u'{0:10000}'.format(u'a'), u'a' + u' ' * 9999)
    self.assertEqual(u'{0:10000}'.format(u''), u' ' * 10000)
    self.assertEqual(u'{0:10000000}'.format(u''), u' ' * 10000000)

    # issue 12546: use \x00 as a fill character
    # (each group ends with the same spec using the default space fill)
    self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
    self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
    self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
    self.assertEqual('{0:^6s}'.format('foo'), ' foo  ')

    self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
    self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
    self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
    self.assertEqual('{0:<6}'.format(3), '3' + ' ' * 5)

    self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
    self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
    self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
    self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')

    self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
    self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
    self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
    self.assertEqual('{0:^12}'.format(3+2.0j), '   (3+2j)   ')

    # format specifiers for user defined type
    self.assertEqual(u'{0:abc}'.format(C()), u'abc')

    # !r and !s coercions
    self.assertEqual(u'{0!s}'.format(u'Hello'), u'Hello')
    self.assertEqual(u'{0!s:}'.format(u'Hello'), u'Hello')
    self.assertEqual(u'{0!s:15}'.format(u'Hello'), u'Hello' + u' ' * 10)
    self.assertEqual(u'{0!s:15s}'.format(u'Hello'), u'Hello' + u' ' * 10)
    self.assertEqual(u'{0!r}'.format(u'Hello'), u"u'Hello'")
    self.assertEqual(u'{0!r:}'.format(u'Hello'), u"u'Hello'")
    self.assertEqual(u'{0!r}'.format(F(u'Hello')), u'F(Hello)')

    # test fallback to object.__format__
    self.assertEqual(u'{0}'.format({}), u'{}')
    self.assertEqual(u'{0}'.format([]), u'[]')
    self.assertEqual(u'{0}'.format([1]), u'[1]')
    self.assertEqual(u'{0}'.format(E(u'data')), u'E(data)')
    self.assertEqual(u'{0:d}'.format(G(u'data')), u'G(data)')
    self.assertEqual(u'{0!s}'.format(G(u'data')), u'string is data')

    # these go through object.__format__ with a non-empty spec, which
    # is expected to emit a PendingDeprecationWarning
    msg = 'object.__format__ with a non-empty format string is deprecated'
    with test_support.check_warnings((msg, PendingDeprecationWarning)):
        # u'E(data)' is seven characters: one space left, two right
        self.assertEqual(u'{0:^10}'.format(E(u'data')), u' E(data)  ')
        self.assertEqual(u'{0:^10s}'.format(E(u'data')), u' E(data)  ')
        self.assertEqual(u'{0:>15s}'.format(G(u'data')), u' string is data')

    self.assertEqual(u"{0:date: %Y-%m-%d}".format(I(year=2007,
                                                    month=8,
                                                    day=27)),
                     u"date: 2007-08-27")

    # test deriving from a builtin type and overriding __format__
    self.assertEqual(u"{0}".format(J(10)), u"20")


    # string format specifiers
    self.assertEqual(u'{0:}'.format('a'), u'a')

    # computed format specifiers
    self.assertEqual(u"{0:.{1}}".format(u'hello world', 5), u'hello')
    self.assertEqual(u"{0:.{1}s}".format(u'hello world', 5), u'hello')
    self.assertEqual(u"{0:.{precision}s}".format('hello world', precision=5), u'hello')
    self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width=10, precision=5),
                     u'hello' + u' ' * 5)
    self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width='10', precision='5'),
                     u'hello' + u' ' * 5)

    # test various errors
    self.assertRaises(ValueError, u'{'.format)
    self.assertRaises(ValueError, u'}'.format)
    self.assertRaises(ValueError, u'a{'.format)
    self.assertRaises(ValueError, u'a}'.format)
    self.assertRaises(ValueError, u'{a'.format)
    self.assertRaises(ValueError, u'}a'.format)
    self.assertRaises(IndexError, u'{0}'.format)
    self.assertRaises(IndexError, u'{1}'.format, u'abc')
    self.assertRaises(KeyError, u'{x}'.format)
    self.assertRaises(ValueError, u"}{".format)
    self.assertRaises(ValueError, u"{".format)
    self.assertRaises(ValueError, u"}".format)
    self.assertRaises(ValueError, u"abc{0:{}".format)
    self.assertRaises(ValueError, u"{0".format)
    self.assertRaises(IndexError, u"{0.}".format)
    self.assertRaises(ValueError, u"{0.}".format, 0)
    self.assertRaises(IndexError, u"{0[}".format)
    self.assertRaises(ValueError, u"{0[}".format, [])
    self.assertRaises(KeyError, u"{0]}".format)
    self.assertRaises(ValueError, u"{0.[]}".format, 0)
    self.assertRaises(ValueError, u"{0..foo}".format, 0)
    self.assertRaises(ValueError, u"{0[0}".format, 0)
    self.assertRaises(ValueError, u"{0[0:foo}".format, 0)
    self.assertRaises(KeyError, u"{c]}".format)
    self.assertRaises(ValueError, u"{{ {{{0}}".format, 0)
    self.assertRaises(ValueError, u"{0}}".format, 0)
    self.assertRaises(KeyError, u"{foo}".format, bar=3)
    self.assertRaises(ValueError, u"{0!x}".format, 3)
    self.assertRaises(ValueError, u"{0!}".format, 0)
    self.assertRaises(ValueError, u"{0!rs}".format, 0)
    self.assertRaises(ValueError, u"{!}".format)
    self.assertRaises(IndexError, u"{:}".format)
    self.assertRaises(IndexError, u"{:s}".format)
    self.assertRaises(IndexError, u"{}".format)
    # a field number or index too large to be parsed
    big = u"23098475029384702983476098230754973209482573"
    self.assertRaises(ValueError, (u"{" + big + u"}").format)
    self.assertRaises(ValueError, (u"{[" + big + u"]}").format, [0])

    # issue 6089
    self.assertRaises(ValueError, u"{0[0]x}".format, [None])
    self.assertRaises(ValueError, u"{0[0](10)}".format, [None])

    # can't have a replacement on the field name portion
    self.assertRaises(TypeError, u'{0[{1}]}'.format, u'abcdefg', 4)

    # exceed maximum recursion depth
    self.assertRaises(ValueError, u"{0:{1:{2}}}".format, u'abc', u's', u'')
    self.assertRaises(ValueError, u"{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
                      0, 1, 2, 3, 4, 5, 6, 7)

    # string format spec errors
    self.assertRaises(ValueError, u"{0:-s}".format, u'')
    self.assertRaises(ValueError, format, u"", u"-")
    self.assertRaises(ValueError, u"{0:=s}".format, u'')

    # test combining string and unicode
    self.assertEqual(u"foo{0}".format('bar'), u'foobar')
    # This will try to convert the argument from unicode to str, which
    # will succeed
    self.assertEqual("foo{0}".format(u'bar'), 'foobar')
    # This will try to convert the argument from unicode to str, which
    # will fail
    self.assertRaises(UnicodeEncodeError, "foo{0}".format, u'\u1000bar')
1597
def test_format_huge_precision(self):
    # A precision one past sys.maxsize must be rejected with ValueError.
    oversized_spec = u".{}f".format(sys.maxsize + 1)
    self.assertRaises(ValueError, format, 2.34, oversized_spec)
1602
def test_format_huge_width(self):
    # A field width one past sys.maxsize must be rejected with ValueError.
    oversized_spec = u"{}f".format(sys.maxsize + 1)
    self.assertRaises(ValueError, format, 2.34, oversized_spec)
1607
def test_format_huge_item_number(self):
    # A positional field number one past sys.maxsize must be rejected
    # with ValueError when the template is applied.
    template = u"{{{}:.6f}}".format(sys.maxsize + 1)
    self.assertRaises(ValueError, template.format, 2.34)
1612
def test_format_auto_numbering(self):
    # Replacement fields without an explicit index ("{}") are numbered
    # automatically, including fields nested inside a format spec.
    # Expected strings spell out the padding exactly.
    class C:
        def __init__(self, x=100):
            self._x = x
        def __format__(self, spec):
            return spec

    self.assertEqual(u'{}'.format(10), u'10')
    # width 5: the one-character string is followed by four spaces
    self.assertEqual(u'{:5}'.format('s'), u's' + u' ' * 4)
    self.assertEqual(u'{!r}'.format('s'), u"'s'")
    self.assertEqual(u'{._x}'.format(C(10)), u'10')
    self.assertEqual(u'{[1]}'.format([1, 2]), u'2')
    self.assertEqual(u'{[a]}'.format({'a':4, 'b':2}), u'4')
    self.assertEqual(u'a{}b{}c'.format(0, 1), u'a0b1c')

    # the nested field supplies the spec; '^10' centers 'x' in ten
    # columns with four spaces on the left and five on the right
    self.assertEqual(u'a{:{}}b'.format('x', '^10'),
                     u'a' + u' ' * 4 + u'x' + u' ' * 5 + u'b')
    self.assertEqual(u'a{:{}x}b'.format(20, '#'), u'a0x14b')

    # can't mix and match numbering and auto-numbering
    self.assertRaises(ValueError, u'{}{1}'.format, 1, 2)
    self.assertRaises(ValueError, u'{1}{}'.format, 1, 2)
    self.assertRaises(ValueError, u'{:{1}}'.format, 1, 2)
    self.assertRaises(ValueError, u'{0:{}}'.format, 1, 2)

    # can mix and match auto-numbering and named
    self.assertEqual(u'{f}{}'.format(4, f='test'), u'test4')
    self.assertEqual(u'{}{f}'.format(4, f='test'), u'4test')
    self.assertEqual(u'{:{f}}{g}{}'.format(1, 3, g='g', f=2), u' 1g3')
    self.assertEqual(u'{f:{}}{}{g}'.format(2, 4, f=1, g='g'), u' 14g')
1642
def test_raiseMemError(self):
    # Ensure that the freelist contains a consistent object, even
    # when a string allocation fails with a MemoryError.
    # This used to crash the interpreter,
    # or leak references when the number was smaller.
    if sys.maxunicode >= 0x10000:
        charwidth = 4
    else:
        charwidth = 2

    def alloc():
        # Note: sys.maxsize is half of the actual max allocation because
        # of the signedness of Py_ssize_t.
        return u"a" * (sys.maxsize // charwidth * 2)

    # The failing allocation is attempted twice on purpose.
    for _ in range(2):
        self.assertRaises(MemoryError, alloc)
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +00001654
def test_format_subclass(self):
    # Formatting an instance of a unicode subclass with a byte-string
    # template must use the subclass's __unicode__() override, both for
    # %-formatting and for str.format().
    class Overriding(unicode):
        def __unicode__(self):
            return u'__unicode__ overridden'

    instance = Overriding(u'xxx')
    percent_result = "%s" % instance
    self.assertEqual(percent_result, u'__unicode__ overridden')
    brace_result = "{}".format(instance)
    self.assertEqual(brace_result, '__unicode__ overridden')
Victor Stinner95affc42010-03-22 12:24:37 +00001662
# Test PyUnicode_FromFormat()
def test_from_format(self):
    # Calls the C-level PyUnicode_FromFormat() through ctypes and compares
    # its output with unicode values built in Python.
    # NOTE(review): import_module() presumably skips the test when ctypes
    # is unavailable -- confirm against test_support.
    test_support.import_module('ctypes')
    from ctypes import (
        pythonapi, py_object, sizeof,
        c_int, c_long, c_longlong, c_ssize_t,
        c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
    # The exported symbol name depends on the build's unicode width.
    if sys.maxunicode == 0xffff:
        name = "PyUnicodeUCS2_FromFormat"
    else:
        name = "PyUnicodeUCS4_FromFormat"
    _PyUnicode_FromFormat = getattr(pythonapi, name)
    # Have ctypes hand the result back as a Python object.
    _PyUnicode_FromFormat.restype = py_object

    def PyUnicode_FromFormat(format, *args):
        # unicode arguments are wrapped in py_object; everything else
        # (ctypes values, byte strings, None) is passed through as given.
        cargs = tuple(
            py_object(arg) if isinstance(arg, unicode) else arg
            for arg in args)
        return _PyUnicode_FromFormat(format, *cargs)

    def check_format(expected, format, *args):
        # Format and compare against the expected unicode string.
        text = PyUnicode_FromFormat(format, *args)
        self.assertEqual(expected, text)

    # ascii format, non-ascii argument
    check_format(u'ascii\x7f=unicode\xe9',
                 b'ascii\x7f=%U', u'unicode\xe9')

    # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
    # raises an error
    #self.assertRaisesRegex(ValueError,
    #    '^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
    #    'string, got a non-ASCII byte: 0xe9$',
    #    PyUnicode_FromFormat, b'unicode\xe9=%s', u'ascii')

    # test "%c"
    check_format(u'\uabcd',
                 b'%c', c_int(0xabcd))
    if sys.maxunicode > 0xffff:
        check_format(u'\U0010ffff',
                     b'%c', c_int(0x10ffff))
    else:
        # where sys.maxunicode is 0xffff a code point above it must
        # overflow
        with self.assertRaises(OverflowError):
            PyUnicode_FromFormat(b'%c', c_int(0x10000))
    # 0x110000 is expected to overflow on every build
    with self.assertRaises(OverflowError):
        PyUnicode_FromFormat(b'%c', c_int(0x110000))
    # Issue #18183
    if sys.maxunicode > 0xffff:
        check_format(u'\U00010000\U00100000',
                     b'%c%c', c_int(0x10000), c_int(0x100000))

    # test "%"
    check_format(u'%',
                 b'%')
    check_format(u'%',
                 b'%%')
    check_format(u'%s',
                 b'%%s')
    check_format(u'[%]',
                 b'[%%]')
    check_format(u'%abc',
                 b'%%%s', b'abc')

    # test %S
    check_format(u"repr=abc",
                 b'repr=%S', u'abc')

    # test %R
    check_format(u"repr=u'abc'",
                 b'repr=%R', u'abc')

    # test integer formats (%i, %d, %u)
    check_format(u'010',
                 b'%03i', c_int(10))
    check_format(u'0010',
                 b'%0.4i', c_int(10))
    check_format(u'-123',
                 b'%i', c_int(-123))

    check_format(u'-123',
                 b'%d', c_int(-123))
    check_format(u'-123',
                 b'%ld', c_long(-123))
    check_format(u'-123',
                 b'%zd', c_ssize_t(-123))

    check_format(u'123',
                 b'%u', c_uint(123))
    check_format(u'123',
                 b'%lu', c_ulong(123))
    check_format(u'123',
                 b'%zu', c_size_t(123))

    # test long output
    # (extreme values are derived from the platform's sizeof(long))
    min_long = -(2 ** (8 * sizeof(c_long) - 1))
    max_long = -min_long - 1
    check_format(unicode(min_long),
                 b'%ld', c_long(min_long))
    check_format(unicode(max_long),
                 b'%ld', c_long(max_long))
    max_ulong = 2 ** (8 * sizeof(c_ulong)) - 1
    check_format(unicode(max_ulong),
                 b'%lu', c_ulong(max_ulong))
    # the %p result is not checked; the call only has to complete
    PyUnicode_FromFormat(b'%p', c_void_p(-1))

    # test padding (width and/or precision)
    # expected values are built with rjust(): a width pads with spaces,
    # a precision (or a leading 0 flag) pads with zeros
    check_format(u'123'.rjust(10, u'0'),
                 b'%010i', c_int(123))
    check_format(u'123'.rjust(100),
                 b'%100i', c_int(123))
    check_format(u'123'.rjust(100, u'0'),
                 b'%.100i', c_int(123))
    check_format(u'123'.rjust(80, u'0').rjust(100),
                 b'%100.80i', c_int(123))

    check_format(u'123'.rjust(10, u'0'),
                 b'%010u', c_uint(123))
    check_format(u'123'.rjust(100),
                 b'%100u', c_uint(123))
    check_format(u'123'.rjust(100, u'0'),
                 b'%.100u', c_uint(123))
    check_format(u'123'.rjust(80, u'0').rjust(100),
                 b'%100.80u', c_uint(123))

    check_format(u'123'.rjust(10, u'0'),
                 b'%010x', c_int(0x123))
    check_format(u'123'.rjust(100),
                 b'%100x', c_int(0x123))
    check_format(u'123'.rjust(100, u'0'),
                 b'%.100x', c_int(0x123))
    check_format(u'123'.rjust(80, u'0').rjust(100),
                 b'%100.80x', c_int(0x123))

    # test %V
    # (takes two arguments: a unicode object or None, then a byte
    # string; the cases below expect the byte string to be used only
    # when the first argument is None)
    check_format(u'repr=abc',
                 b'repr=%V', u'abc', b'xyz')
    check_format(u'repr=\xe4\xba\xba\xe6\xb0\x91',
                 b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
    check_format(u'repr=abc\xff',
                 b'repr=%V', None, b'abc\xff')

    # not supported: copy the raw format string. these tests are just here
    # to check for crashes and should not be considered as specifications
    check_format(u'%s',
                 b'%1%s', b'abc')
    check_format(u'%1abc',
                 b'%1abc')
    check_format(u'%+i',
                 b'%+i', c_int(10))
    check_format(u'%s',
                 b'%.%s', b'abc')
1814
@test_support.cpython_only
def test_encode_decimal(self):
    # Drive the decimal encoder exposed by _testcapi, first with its
    # default error handling and then with each named error handler.
    from _testcapi import unicode_encodedecimal as encode

    # (input, expected) -- no error handler passed
    default_cases = [
        (u'123', b'123'),
        (u'\u0663.\u0661\u0664', b'3.14'),
        (u"\N{EM SPACE}3.14\N{EN SPACE}", b' 3.14 '),
    ]
    for text, expected in default_cases:
        self.assertEqual(encode(text), expected)

    # a non-decimal character is an error under "strict"
    self.assertRaises(UnicodeEncodeError,
                      encode, u"123\u20ac", "strict")

    # (input, error handler, expected)
    handler_cases = [
        (u"123\u20ac", "replace", b'123?'),
        (u"123\u20ac", "ignore", b'123'),
        (u"123\u20ac", "xmlcharrefreplace", b'123&#8364;'),
        (u"123\u20ac", "backslashreplace", b'123\\u20ac'),
        (u"123\u20ac\N{EM SPACE}", "replace", b'123? '),
        (u"123\u20ac\u20ac", "replace", b'123??'),
        (u"123\u20ac\u0660", "replace", b'123?0'),
    ]
    for text, handler, expected in handler_cases:
        self.assertEqual(encode(text, handler), expected)
1840
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001841 @test_support.cpython_only
Serhiy Storchakae822b032013-08-06 16:56:26 +03001842 def test_encode_decimal_with_surrogates(self):
1843 from _testcapi import unicode_encodedecimal
1844 tests = [(u'\U0001f49d', '&#128157;'),
1845 (u'\ud83d', '&#55357;'),
1846 (u'\udc9d', '&#56477;'),
Serhiy Storchakae822b032013-08-06 16:56:26 +03001847 ]
Serhiy Storchaka1fdc7022013-10-31 17:06:03 +02001848 if u'\ud83d\udc9d' != u'\U0001f49d':
1849 tests += [(u'\ud83d\udc9d', '&#55357;&#56477;')]
Serhiy Storchakae822b032013-08-06 16:56:26 +03001850 for s, exp in tests:
1851 self.assertEqual(
1852 unicode_encodedecimal(u"123" + s, "xmlcharrefreplace"),
1853 '123' + exp)
Victor Stinner95affc42010-03-22 12:24:37 +00001854
def test_main():
    # Hand this module's name to the shared regression-test runner.
    test_support.run_unittest(__name__)


# Allow the file to be executed directly as a script.
if __name__ == "__main__":
    test_main()