blob: 125fd56299468f3a749d8d3c842fd84fb5fa6516 [file] [log] [blame]
# -*- coding: iso-8859-1 -*-
""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
Neal Norwitz9d72bb42007-04-17 08:48:32 +00009import unittest, sys, codecs, new
Walter Dörwald0fd583c2003-02-21 12:53:50 +000010from test import test_support, string_tests
Guido van Rossuma831cac2000-03-10 23:23:21 +000011
# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook serving two deliberately broken test codecs.

    "test.unicode1" maps to encode/decode callables that return a bare
    int instead of a tuple; "test.unicode2" maps to callables that
    return a tuple whose first item is an int rather than a string.
    Any other encoding name yields None.
    """
    def decode1(input, errors="strict"):
        return 42 # not a tuple
    def encode1(input, errors="strict"):
        return 42 # not a tuple
    def encode2(input, errors="strict"):
        return (42, 42) # no unicode
    def decode2(input, errors="strict"):
        return (42, 42) # no unicode
    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    elif encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    else:
        return None

# Install the hook at import time so the test codecs can be looked up.
codecs.register(search_function)
29
Walter Dörwald0fd583c2003-02-21 12:53:50 +000030class UnicodeTest(
31 string_tests.CommonTest,
Walter Dörwald57d88e52004-08-26 16:53:04 +000032 string_tests.MixinStrUnicodeUserStringTest,
33 string_tests.MixinStrUnicodeTest,
Walter Dörwald0fd583c2003-02-21 12:53:50 +000034 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000035 type2test = str
Walter Dörwald0fd583c2003-02-21 12:53:50 +000036
def checkequalnofix(self, result, object, methodname, *args):
    """Check that object.methodname(*args) equals result, with the same type.

    If the call handed back `object` itself, the call is repeated on a
    str subclass instance, which must produce an equal result that is
    not the subclass instance itself.
    """
    method = getattr(object, methodname)
    realresult = method(*args)
    self.assertEqual(realresult, result)
    self.assert_(type(realresult) is type(result))

    # if the original is returned make sure that
    # this doesn't happen with subclasses
    if realresult is object:
        class usub(str):
            def __repr__(self):
                return 'usub(%r)' % str.__repr__(self)
        object = usub(object)
        method = getattr(object, methodname)
        realresult = method(*args)
        self.assertEqual(realresult, result)
        self.assert_(object is not realresult)
Guido van Rossume4874ae2001-09-21 15:36:41 +000054
def test_literals(self):
    # Escapes of different widths that name the same code point are equal.
    self.assertEqual('\xff', '\u00ff')
    self.assertEqual('\uffff', '\U0000ffff')
    # Evaluating a literal whose \U escape is 0x110000 or larger must
    # raise UnicodeError.
    self.assertRaises(UnicodeError, eval, 'u\'\\Ufffffffe\'')
    self.assertRaises(UnicodeError, eval, 'u\'\\Uffffffff\'')
    self.assertRaises(UnicodeError, eval, 'u\'\\U%08x\'' % 0x110000)
61
def test_repr(self):
    # Everything below is skipped on platforms whose name starts with 'java'.
    if not sys.platform.startswith('java'):
        # Test basic sanity of repr()
        self.assertEqual(repr('abc'), "u'abc'")
        self.assertEqual(repr('ab\\c'), "u'ab\\\\c'")
        self.assertEqual(repr('ab\\'), "u'ab\\\\'")
        self.assertEqual(repr('\\c'), "u'\\\\c'")
        self.assertEqual(repr('\\'), "u'\\\\'")
        self.assertEqual(repr('\n'), "u'\\n'")
        self.assertEqual(repr('\r'), "u'\\r'")
        self.assertEqual(repr('\t'), "u'\\t'")
        self.assertEqual(repr('\b'), "u'\\x08'")
        self.assertEqual(repr("'\""), """u'\\'"'""")
        self.assertEqual(repr("'"), '''u"'"''')
        self.assertEqual(repr('"'), """u'"'""")
        # Expected repr of the 256 code points 0x00-0xff joined in order.
        latin1repr = (
            "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
            "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
            "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
            "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
            "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
            "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
            "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
            "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
            "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
            "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
            "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
            "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
            "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
            "\\xfe\\xff'")
        testrepr = repr(''.join(map(unichr, xrange(256))))
        self.assertEqual(testrepr, latin1repr)
        # Test repr works on wide unicode escapes without overflow.
        # Both sides are the same expression: the point is that repr()
        # completes, not what it returns.
        self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
                         repr("\U00010000" * 39 + "\uffff" * 4096))
Walter Dörwald28256f22003-01-19 16:59:20 +000098
def test_iterators(self):
    # Make sure unicode objects have an __iter__ method
    it = "\u1111\u2222\u3333".__iter__()
    self.assertEqual(next(it), "\u1111")
    self.assertEqual(next(it), "\u2222")
    self.assertEqual(next(it), "\u3333")
    # A fourth next() must signal exhaustion.
    self.assertRaises(StopIteration, next, it)
Guido van Rossum49d6b072006-08-17 21:11:47 +0000106
def test_count(self):
    string_tests.CommonTest.test_count(self)
    # check mixed argument types
    # Each entry is (expected count, arguments passed to 'aaa'.count);
    # cases that were listed more than once appear here a single time.
    for expected, args in (
        (3, ('a',)),
        (0, ('b',)),
        (1, ('a', -1)),
        (3, ('a', -10)),
        (2, ('a', 0, -1)),
        (0, ('a', 0, -10)),
    ):
        self.checkequalnofix(expected, 'aaa', 'count', *args)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000119
def test_find(self):
    # find() with and without a start offset.
    self.checkequalnofix(0, 'abcdefghiabc', 'find', 'abc')
    self.checkequalnofix(9, 'abcdefghiabc', 'find', 'abc', 1)
    self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)

    # Missing or non-string argument must raise TypeError.
    self.assertRaises(TypeError, 'hello'.find)
    self.assertRaises(TypeError, 'hello'.find, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000127
def test_rfind(self):
    string_tests.CommonTest.test_rfind(self)
    # check mixed argument types
    self.checkequalnofix(9, 'abcdefghiabc', 'rfind', 'abc')
    # The empty-needle case was listed twice; once is enough.
    self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
Guido van Rossum8b264542000-12-19 02:22:31 +0000134
def test_index(self):
    string_tests.CommonTest.test_index(self)
    # check mixed argument types
    # Both type pairs were (str, str), so a single pass covers them.
    for (t1, t2) in ((str, str),):
        self.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2(''))
        self.checkequalnofix(3, t1('abcdefghiabc'), 'index', t2('def'))
        self.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2('abc'))
        self.checkequalnofix(9, t1('abcdefghiabc'), 'index', t2('abc'), 1)
        self.assertRaises(ValueError, t1('abcdefghiabc').index, t2('hib'))
        self.assertRaises(ValueError, t1('abcdefghiab').index, t2('abc'), 1)
        self.assertRaises(ValueError, t1('abcdefghi').index, t2('ghi'), 8)
        self.assertRaises(ValueError, t1('abcdefghi').index, t2('ghi'), -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000147
def test_rindex(self):
    string_tests.CommonTest.test_rindex(self)
    # check mixed argument types
    # Both type pairs were (str, str), so a single pass covers them.
    for (t1, t2) in ((str, str),):
        self.checkequalnofix(12, t1('abcdefghiabc'), 'rindex', t2(''))
        self.checkequalnofix(3, t1('abcdefghiabc'), 'rindex', t2('def'))
        self.checkequalnofix(9, t1('abcdefghiabc'), 'rindex', t2('abc'))
        self.checkequalnofix(0, t1('abcdefghiabc'), 'rindex', t2('abc'), 0, -1)

        self.assertRaises(ValueError, t1('abcdefghiabc').rindex, t2('hib'))
        self.assertRaises(ValueError, t1('defghiabc').rindex, t2('def'), 1)
        self.assertRaises(ValueError, t1('defghiabc').rindex, t2('abc'), 0, -1)
        self.assertRaises(ValueError, t1('abcdefghi').rindex, t2('ghi'), 0, 8)
        self.assertRaises(ValueError, t1('abcdefghi').rindex, t2('ghi'), 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000162
def test_translate(self):
    # Mapping values may be None (delete), an ordinal, or a string.
    self.checkequalnofix('bbbc', 'abababc', 'translate', {ord('a'):None})
    self.checkequalnofix('iiic', 'abababc', 'translate', {ord('a'):None, ord('b'):ord('i')})
    self.checkequalnofix('iiix', 'abababc', 'translate', {ord('a'):None, ord('b'):ord('i'), ord('c'):'x'})
    self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', {ord('a'):None, ord('b'):'<i>'})
    self.checkequalnofix('c', 'abababc', 'translate', {ord('a'):None, ord('b'):''})
    self.checkequalnofix('xyyx', 'xzx', 'translate', {ord('z'):'yy'})

    self.assertRaises(TypeError, 'hello'.translate)
    # NOTE(review): a mapping to '' is accepted a few lines above on the
    # same kind of literal, yet a TypeError is expected here -- confirm
    # this expectation still holds for the str type under test.
    self.assertRaises(TypeError, 'abababc'.translate, {ord('a'):''})
Guido van Rossuma831cac2000-03-10 23:23:21 +0000173
def test_split(self):
    string_tests.CommonTest.test_split(self)

    # Mixed arguments
    # (the '//' case was listed twice; once is enough)
    self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//')
    self.checkequalnofix(['endcase ', ''], 'endcase test', 'split', 'test')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000181
def test_join(self):
    string_tests.MixinStrUnicodeUserStringTest.test_join(self)

    # mixed arguments
    # One call per container kind: list, tuple and string_tests.Sequence.
    # The repeated copies of these three cases have been dropped.
    self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
    self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
    self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
Marc-André Lemburge5034372000-08-08 08:04:29 +0000193
def test_strip(self):
    string_tests.CommonTest.test_strip(self)
    # NOTE(review): both operands are plain string literals now; confirm
    # that stripping "\xff" is still expected to raise UnicodeError.
    self.assertRaises(UnicodeError, "hello".strip, "\xff")
Guido van Rossuma831cac2000-03-10 23:23:21 +0000197
def test_replace(self):
    string_tests.CommonTest.test_replace(self)

    # method call forwarded from str implementation because of unicode argument
    self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
    # A non-string replacement value must raise TypeError.
    self.assertRaises(TypeError, 'replace'.replace, "r", 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000204
def test_comparison(self):
    # Comparisons:
    # (each of these was listed three times; once is enough)
    self.assertEqual('abc', 'abc')
    self.assert_('abcd' > 'abc')
    self.assert_('abc' < 'abcd')

    # The block below is deliberately disabled and never runs.
    if 0:
        # Move these tests to a Unicode collation module test...
        # Testing UTF-16 code point order comparisons...

        # No surrogates, no fixup required.
        self.assert_('\u0061' < '\u20ac')
        # Non surrogate below surrogate value, no fixup required
        self.assert_('\u0061' < '\ud800\udc02')

        # Non surrogate above surrogate value, fixup required
        def test_lecmp(s, s2):
            self.assert_(s < s2)

        def test_fixup(s):
            # Compare s against sixteen surrogate pairs, in the same
            # order the original straight-line code used.
            for s2 in (
                '\ud800\udc01', '\ud900\udc01', '\uda00\udc01', '\udb00\udc01',
                '\ud800\udd01', '\ud900\udd01', '\uda00\udd01', '\udb00\udd01',
                '\ud800\ude01', '\ud900\ude01', '\uda00\ude01', '\udb00\ude01',
                '\ud800\udfff', '\ud900\udfff', '\uda00\udfff', '\udb00\udfff',
            ):
                test_lecmp(s, s2)

        test_fixup('\ue000')
        test_fixup('\uff61')

        # Surrogates on both sides, no fixup required
        self.assert_('\ud800\udc02' < '\ud84d\udc56')
Walter Dörwald28256f22003-01-19 16:59:20 +0000269
def test_islower(self):
    string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
    # U+1FFC must not be reported as lowercase.
    self.checkequalnofix(False, '\u1FFc', 'islower')
Walter Dörwald28256f22003-01-19 16:59:20 +0000273
def test_isupper(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
    # The U+1FFC check is skipped on platforms whose name starts with 'java'.
    if not sys.platform.startswith('java'):
        self.checkequalnofix(False, '\u1FFc', 'isupper')
Walter Dörwald28256f22003-01-19 16:59:20 +0000278
def test_istitle(self):
    # NOTE(review): this delegates to the parent's test_title, not
    # test_istitle -- confirm that is intended.
    string_tests.MixinStrUnicodeUserStringTest.test_title(self)
    self.checkequalnofix(True, '\u1FFc', 'istitle')
    self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle')
Walter Dörwald28256f22003-01-19 16:59:20 +0000283
def test_isspace(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
    # U+2000 and U+200A count as whitespace; U+2014 does not.
    self.checkequalnofix(True, '\u2000', 'isspace')
    self.checkequalnofix(True, '\u200a', 'isspace')
    self.checkequalnofix(False, '\u2014', 'isspace')
Walter Dörwald28256f22003-01-19 16:59:20 +0000289
def test_isalpha(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
    # U+1FFC must be reported as alphabetic.
    self.checkequalnofix(True, '\u1FFc', 'isalpha')
Walter Dörwald28256f22003-01-19 16:59:20 +0000293
def test_isdecimal(self):
    # Empty string, letters and non-decimal numerics are rejected;
    # decimal digits are accepted.
    self.checkequalnofix(False, '', 'isdecimal')
    self.checkequalnofix(False, 'a', 'isdecimal')
    self.checkequalnofix(True, '0', 'isdecimal')
    self.checkequalnofix(False, '\u2460', 'isdecimal') # CIRCLED DIGIT ONE
    self.checkequalnofix(False, '\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
    self.checkequalnofix(True, '\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
    self.checkequalnofix(True, '0123456789', 'isdecimal')
    self.checkequalnofix(False, '0123456789a', 'isdecimal')

    # isdecimal() takes no arguments.
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)
Walter Dörwald28256f22003-01-19 16:59:20 +0000305
def test_isdigit(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
    # Same three characters as in test_isdecimal and test_isnumeric.
    self.checkequalnofix(True, '\u2460', 'isdigit')
    self.checkequalnofix(False, '\xbc', 'isdigit')
    self.checkequalnofix(True, '\u0660', 'isdigit')
Walter Dörwald28256f22003-01-19 16:59:20 +0000311
def test_isnumeric(self):
    # Empty string and letters are rejected; all three special
    # characters checked here are expected to count as numeric.
    self.checkequalnofix(False, '', 'isnumeric')
    self.checkequalnofix(False, 'a', 'isnumeric')
    self.checkequalnofix(True, '0', 'isnumeric')
    self.checkequalnofix(True, '\u2460', 'isnumeric')
    self.checkequalnofix(True, '\xbc', 'isnumeric')
    self.checkequalnofix(True, '\u0660', 'isnumeric')
    self.checkequalnofix(True, '0123456789', 'isnumeric')
    self.checkequalnofix(False, '0123456789a', 'isnumeric')

    # isnumeric() takes no arguments.
    self.assertRaises(TypeError, "abc".isnumeric, 42)
Walter Dörwald28256f22003-01-19 16:59:20 +0000323
def test_contains(self):
    # Testing Unicode contains method
    # Assertions that were repeated verbatim appear here once each.
    self.assert_('a' in 'abdb')
    self.assert_('a' in 'bdab')
    self.assert_('a' in 'bdaba')
    self.assert_('a' in 'bdba')
    self.assert_('a' not in 'bdb')
    self.assert_('a' in ('a',1,None))
    self.assert_('a' in (1,None,'a'))
    self.assert_('a' not in ('x',1,'y'))
    self.assert_('a' not in ('x',1,None))
    self.assert_('abcd' not in 'abcxxxx')
    self.assert_('ab' in 'abcd')
    self.assert_('ab' in 'abc')
    self.assert_('ab' in (1,None,'ab'))
    self.assert_('' in 'abc')

    # If the following fails either
    # the contains operator does not propagate UnicodeErrors or
    # someone has changed the default encoding
    # NOTE(review): both operands are plain string literals now; confirm
    # a UnicodeError is still the expected outcome.
    self.assertRaises(UnicodeError, 'g\xe2teau'.__contains__, '\xe2')

    self.assert_('' in '')
    self.assert_('\0' not in 'abc')
    self.assert_('\0' in '\0abc')
    self.assert_('\0' in 'abc\0')
    self.assert_('a' in '\0abc')
    self.assert_('asdf' in 'asdf')
    self.assert_('asdf' not in 'asd')
    self.assert_('asdf' not in '')

    # __contains__ requires exactly one argument.
    self.assertRaises(TypeError, "abc".__contains__)
Walter Dörwald28256f22003-01-19 16:59:20 +0000385
def test_formatting(self):
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
    # %5.2f pads to a width of five, so "3.00" is preceded by two spaces
    # (the separator plus one pad character).
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
    if not sys.platform.startswith('java'):
        # NOTE(review): both arguments are the same kind of literal, yet
        # the expected text shows two different reprs -- confirm.
        self.assertEqual("%r, %r" % ("abc", "abc"), "u'abc', 'abc'")
    self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
    self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')

    self.assertEqual('%c' % 0x1234, '\u1234')
    self.assertRaises(OverflowError, "%c".__mod__, (sys.maxunicode+1,))

    # formatting jobs delegated from the string implementation:
    # (verbatim repeats of the two mapping cases have been dropped)
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % "abc", '...abc...')
    # '*' takes the field width (and precision) from the argument tuple;
    # a negative width left-justifies.  Results are padded to width 5.
    self.assertEqual('%*s' % (5,'abc',), '  abc')
    self.assertEqual('%*s' % (-5,'abc',), 'abc  ')
    self.assertEqual('%*.*s' % (5,2,'abc',), '   ab')
    self.assertEqual('%*.*s' % (5,3,'abc',), '  abc')
    self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
    self.assertEqual('%c' % 'a', 'a')
    # %s must use the object's __str__ result, including non-ASCII text.
    class Wrapper:
        def __str__(self):
            return '\u1234'
    self.assertEqual('%s' % Wrapper(), '\u1234')
Walter Dörwald28256f22003-01-19 16:59:20 +0000424
@test_support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    # should not format with a comma, but always with C locale
    self.assertEqual('1.0', '%.1f' % 1.0)
Georg Brandlda6b1072006-01-20 17:48:54 +0000429
def test_constructor(self):
    # unicode(obj) tests (this maps to PyObject_Unicode() at C level)

    self.assertEqual(
        str('unicode remains unicode'),
        'unicode remains unicode'
    )

    class UnicodeSubclass(str):
        pass

    self.assertEqual(
        str(UnicodeSubclass('unicode subclass becomes unicode')),
        'unicode subclass becomes unicode'
    )

    self.assertEqual(
        str('strings are converted to unicode'),
        'strings are converted to unicode'
    )

    # Helper exposing its payload only through __unicode__.
    class UnicodeCompat:
        def __init__(self, x):
            self.x = x
        def __unicode__(self):
            return self.x

    self.assertEqual(
        str(UnicodeCompat('__unicode__ compatible objects are recognized')),
        '__unicode__ compatible objects are recognized')

    # Helper exposing its payload only through __str__.
    class StringCompat:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return self.x

    self.assertEqual(
        str(StringCompat('__str__ compatible objects are recognized')),
        '__str__ compatible objects are recognized'
    )

    # unicode(obj) is compatible to str():
    # (this assertion was listed twice; once is enough)

    o = StringCompat('unicode(obj) is compatible to str()')
    self.assertEqual(str(o), 'unicode(obj) is compatible to str()')

    # %-formatting and .__unicode__()
    self.assertEqual('%s' %
                     UnicodeCompat("u'%s' % obj uses obj.__unicode__()"),
                     "u'%s' % obj uses obj.__unicode__()")
    self.assertEqual('%s' %
                     UnicodeCompat("u'%s' % obj falls back to obj.__str__()"),
                     "u'%s' % obj falls back to obj.__str__()")

    # 123 appeared twice in the tuple; one occurrence is kept.
    for obj in (123, 123.45):
        self.assertEqual(str(obj), str(str(obj)))

    # unicode(obj, encoding, error) tests (this maps to
    # PyUnicode_FromEncodedObject() at C level)

    # NOTE(review): the next two checks pass the same kind of literal with
    # the same encoding arguments, yet the first expects TypeError and the
    # second expects success -- confirm which one still applies.
    if not sys.platform.startswith('java'):
        self.assertRaises(
            TypeError,
            str,
            'decoding unicode is not supported',
            'utf-8',
            'strict'
        )

    self.assertEqual(
        str('strings are decoded to unicode', 'utf-8', 'strict'),
        'strings are decoded to unicode'
    )

    if not sys.platform.startswith('java'):
        self.assertEqual(
            str(
                buffer('character buffers are decoded to unicode'),
                'utf-8',
                'strict'
            ),
            'character buffers are decoded to unicode'
        )

    # Non-string encoding and errors arguments must raise TypeError.
    self.assertRaises(TypeError, str, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000517
def test_codecs_utf7(self):
    # (text, expected UTF-7 encoding) pairs.
    utfTests = [
        ('A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
        ('Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
        ('\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
        ('Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
        ('+', '+-'),
        ('+-', '+--'),
        ('+?', '+-?'),
        # Backslash followed by '?'; written with an explicit escape so
        # the literal does not rely on an unrecognised escape sequence.
        ('\\?', '+AFw?'),
        (r'\\?', '+AFwAXA?'),
        (r'\\\?', '+AFwAXABc?'),
        (r'++--', '+-+---')
    ]

    for (x, y) in utfTests:
        self.assertEqual(x.encode('utf-7'), y)

    # surrogates not supported
    self.assertRaises(UnicodeError, str, '+3ADYAA-', 'utf-7')

    # With errors='replace' the same input decodes to U+FFFD.
    self.assertEqual(str('+3ADYAA-', 'utf-7', 'replace'), '\ufffd')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000541
def test_codecs_utf8(self):
    """Exercise the UTF-8 codec in both directions.

    Encoding is checked for the empty string, one BMP character, values
    written as surrogate pairs, lone surrogates, a long repeated pair and
    a mixed Japanese/ASCII sentence.  Decoding is checked for two
    four-byte sequences and one three-byte sequence.
    """
    # (text, expected encoded form), checked in this order.
    encode_cases = (
        ('', ''),
        ('\u20ac', '\xe2\x82\xac'),
        ('\ud800\udc02', '\xf0\x90\x80\x82'),
        ('\ud84d\udc56', '\xf0\xa3\x91\x96'),
        ('\ud800', '\xed\xa0\x80'),
        ('\udc00', '\xed\xb0\x80'),
        ('\ud800\udc02'*1000, '\xf0\x90\x80\x82'*1000),
    )
    for text, expected in encode_cases:
        self.assertEqual(text.encode('utf-8'), expected)

    sentence = (
        '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
        '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
        '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
        '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
        '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
        ' Nunstuck git und'
    )
    sentence_utf8 = (
        '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
        '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
        '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
        '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
        '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
        '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
        '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
        '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
        '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
        '\xe3\x80\x8cWenn ist das Nunstuck git und'
    )
    self.assertEqual(sentence.encode('utf-8'), sentence_utf8)

    # UTF-8 specific decoding tests
    decode_cases = (
        ('\xf0\xa3\x91\x96', '\U00023456'),
        ('\xf0\x90\x80\x82', '\U00010002'),
        ('\xe2\x82\xac', '\u20ac'),
    )
    for raw, expected in decode_cases:
        self.assertEqual(str(raw, 'utf-8'), expected)

    # Other possible utf-8 test cases:
    # * strict decoding testing for all of the
    #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000580
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +0000581 def test_codecs_idna(self):
582 # Test whether trailing dot is preserved
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000583 self.assertEqual("www.python.org.".encode("idna"), "www.python.org.")
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +0000584
def test_codecs_errors(self):
    """Check the error handling of encoding, decoding and codec lookup.

    The "test.unicode1" and "test.unicode2" codecs used below are the
    deliberately broken ones supplied by the module-level
    search_function(): their encoders and decoders return 42 or (42, 42)
    instead of a proper result.
    """
    sample = 'Andr\202 x'

    # Error handling (encoding)
    for extra in ((), ('strict',)):
        self.assertRaises(UnicodeError, sample.encode, 'ascii', *extra)
    self.assertEqual(sample.encode('ascii', 'ignore'), "Andr x")
    self.assertEqual(sample.encode('ascii', 'replace'), "Andr? x")

    # Error handling (decoding)
    for extra in ((), ('strict',)):
        self.assertRaises(UnicodeError, str, sample, 'ascii', *extra)
    self.assertEqual(str(sample, 'ascii', 'ignore'), "Andr x")
    self.assertEqual(str(sample, 'ascii', 'replace'), 'Andr\uFFFD x')

    # Error handling (unknown character names)
    unknown_name = "\\N{foo}xx"
    self.assertEqual(unknown_name.decode("unicode-escape", "ignore"), "xx")

    # Error handling (truncated escape sequence)
    self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")

    # Codecs whose encoder/decoder return something unusable.
    greeting = "hello"
    self.assertRaises(TypeError, greeting.decode, "test.unicode1")
    self.assertRaises(TypeError, str, greeting, "test.unicode2")
    for codec_name in ("test.unicode1", "test.unicode2"):
        self.assertRaises(TypeError, greeting.encode, codec_name)

    # executes PyUnicode_Encode()
    import imp
    missing_module = "non-existing module"
    search_path = ["non-existing dir"]
    self.assertRaises(ImportError, imp.find_module, missing_module,
                      search_path)

    # Error handling (wrong arguments)
    self.assertRaises(TypeError, greeting.encode, 42, 42, 42)

    # Error handling (PyUnicode_EncodeDecimal())
    self.assertRaises(UnicodeError, int, "\u0200")
Guido van Rossum97064862000-04-10 13:52:48 +0000622
Walter Dörwald28256f22003-01-19 16:59:20 +0000623 def test_codecs(self):
624 # Encoding
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000625 self.assertEqual('hello'.encode('ascii'), 'hello')
626 self.assertEqual('hello'.encode('utf-7'), 'hello')
627 self.assertEqual('hello'.encode('utf-8'), 'hello')
628 self.assertEqual('hello'.encode('utf8'), 'hello')
629 self.assertEqual('hello'.encode('utf-16-le'), 'h\000e\000l\000l\000o\000')
630 self.assertEqual('hello'.encode('utf-16-be'), '\000h\000e\000l\000l\000o')
631 self.assertEqual('hello'.encode('latin-1'), 'hello')
Guido van Rossum97064862000-04-10 13:52:48 +0000632
Walter Dörwald28256f22003-01-19 16:59:20 +0000633 # Roundtrip safety for BMP (just the first 1024 chars)
Hye-Shik Chang835b2432005-12-17 04:38:31 +0000634 for c in xrange(1024):
635 u = unichr(c)
636 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
637 'utf-16-be', 'raw_unicode_escape',
638 'unicode_escape', 'unicode_internal'):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000639 self.assertEqual(str(u.encode(encoding),encoding), u)
Martin v. Löwis047c05e2002-03-21 08:55:28 +0000640
Walter Dörwald28256f22003-01-19 16:59:20 +0000641 # Roundtrip safety for BMP (just the first 256 chars)
Hye-Shik Chang835b2432005-12-17 04:38:31 +0000642 for c in xrange(256):
643 u = unichr(c)
644 for encoding in ('latin-1',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000645 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000646
Walter Dörwald28256f22003-01-19 16:59:20 +0000647 # Roundtrip safety for BMP (just the first 128 chars)
Hye-Shik Chang835b2432005-12-17 04:38:31 +0000648 for c in xrange(128):
649 u = unichr(c)
650 for encoding in ('ascii',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000651 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000652
Walter Dörwald28256f22003-01-19 16:59:20 +0000653 # Roundtrip safety for non-BMP (just a few chars)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000654 u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
Walter Dörwald28256f22003-01-19 16:59:20 +0000655 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
656 #'raw_unicode_escape',
657 'unicode_escape', 'unicode_internal'):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000658 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000659
Walter Dörwald28256f22003-01-19 16:59:20 +0000660 # UTF-8 must be roundtrip safe for all UCS-2 code points
661 # This excludes surrogates: in the full range, there would be
662 # a surrogate pair (\udbff\udc00), which gets converted back
663 # to a non-BMP character (\U0010fc00)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000664 u = ''.join(map(unichr, range(0,0xd800)+range(0xe000,0x10000)))
Walter Dörwald28256f22003-01-19 16:59:20 +0000665 for encoding in ('utf-8',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000666 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossum9e896b32000-04-05 20:11:21 +0000667
Walter Dörwald28256f22003-01-19 16:59:20 +0000668 def test_codecs_charmap(self):
669 # 0-127
670 s = ''.join(map(chr, xrange(128)))
671 for encoding in (
672 'cp037', 'cp1026',
673 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
674 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
675 'cp863', 'cp865', 'cp866',
676 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
677 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
678 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
679 'mac_cyrillic', 'mac_latin2',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +0000680
Walter Dörwald28256f22003-01-19 16:59:20 +0000681 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
682 'cp1256', 'cp1257', 'cp1258',
683 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +0000684
Walter Dörwald28256f22003-01-19 16:59:20 +0000685 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
686 'cp1006', 'iso8859_8',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000687
Walter Dörwald28256f22003-01-19 16:59:20 +0000688 ### These have undefined mappings:
689 #'cp424',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000690
Walter Dörwald28256f22003-01-19 16:59:20 +0000691 ### These fail the round-trip:
692 #'cp875'
Guido van Rossum9e896b32000-04-05 20:11:21 +0000693
Walter Dörwald28256f22003-01-19 16:59:20 +0000694 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000695 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +0000696
Walter Dörwald28256f22003-01-19 16:59:20 +0000697 # 128-255
698 s = ''.join(map(chr, xrange(128, 256)))
699 for encoding in (
700 'cp037', 'cp1026',
701 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
702 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
703 'cp863', 'cp865', 'cp866',
704 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
705 'iso8859_2', 'iso8859_4', 'iso8859_5',
706 'iso8859_9', 'koi8_r', 'latin_1',
707 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +0000708
Walter Dörwald28256f22003-01-19 16:59:20 +0000709 ### These have undefined mappings:
710 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
711 #'cp1256', 'cp1257', 'cp1258',
712 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
713 #'iso8859_3', 'iso8859_6', 'iso8859_7',
714 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +0000715
Walter Dörwald28256f22003-01-19 16:59:20 +0000716 ### These fail the round-trip:
717 #'cp1006', 'cp875', 'iso8859_8',
Tim Peters2f228e72001-05-13 00:19:31 +0000718
Walter Dörwald28256f22003-01-19 16:59:20 +0000719 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000720 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +0000721
Walter Dörwald28256f22003-01-19 16:59:20 +0000722 def test_concatenation(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000723 self.assertEqual(("abc" "def"), "abcdef")
724 self.assertEqual(("abc" "def"), "abcdef")
725 self.assertEqual(("abc" "def"), "abcdef")
726 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
727 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +0000728
Walter Dörwald28256f22003-01-19 16:59:20 +0000729 def test_printing(self):
730 class BitBucket:
731 def write(self, text):
732 pass
Fred Drake004d5e62000-10-23 17:22:08 +0000733
Walter Dörwald28256f22003-01-19 16:59:20 +0000734 out = BitBucket()
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000735 print('abc', file=out)
736 print('abc', 'def', file=out)
737 print('abc', 'def', file=out)
738 print('abc', 'def', file=out)
739 print('abc\n', file=out)
740 print('abc\n', end=' ', file=out)
741 print('abc\n', end=' ', file=out)
742 print('def\n', file=out)
743 print('def\n', file=out)
Fred Drake004d5e62000-10-23 17:22:08 +0000744
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +0000745 def test_ucs4(self):
746 if sys.maxunicode == 0xFFFF:
747 return
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000748 x = '\U00100000'
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +0000749 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
750 self.assertEqual(x, y)
751
Brett Cannonc3647ac2005-04-26 03:45:26 +0000752 def test_conversion(self):
753 # Make sure __unicode__() works properly
754 class Foo0:
755 def __str__(self):
756 return "foo"
757
758 class Foo1:
759 def __unicode__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000760 return "foo"
Brett Cannonc3647ac2005-04-26 03:45:26 +0000761
762 class Foo2(object):
763 def __unicode__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000764 return "foo"
Brett Cannonc3647ac2005-04-26 03:45:26 +0000765
766 class Foo3(object):
767 def __unicode__(self):
768 return "foo"
769
770 class Foo4(str):
771 def __unicode__(self):
772 return "foo"
773
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000774 class Foo5(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +0000775 def __unicode__(self):
776 return "foo"
777
778 class Foo6(str):
779 def __str__(self):
780 return "foos"
781
782 def __unicode__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000783 return "foou"
Brett Cannonc3647ac2005-04-26 03:45:26 +0000784
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000785 class Foo7(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +0000786 def __str__(self):
787 return "foos"
788 def __unicode__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000789 return "foou"
Brett Cannonc3647ac2005-04-26 03:45:26 +0000790
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000791 class Foo8(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +0000792 def __new__(cls, content=""):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000793 return str.__new__(cls, 2*content)
Brett Cannonc3647ac2005-04-26 03:45:26 +0000794 def __unicode__(self):
795 return self
796
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000797 class Foo9(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +0000798 def __str__(self):
799 return "string"
800 def __unicode__(self):
801 return "not unicode"
802
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000803 self.assertEqual(str(Foo0()), "foo")
804 self.assertEqual(str(Foo1()), "foo")
805 self.assertEqual(str(Foo2()), "foo")
806 self.assertEqual(str(Foo3()), "foo")
807 self.assertEqual(str(Foo4("bar")), "foo")
808 self.assertEqual(str(Foo5("bar")), "foo")
809 self.assertEqual(str(Foo6("bar")), "foou")
810 self.assertEqual(str(Foo7("bar")), "foou")
811 self.assertEqual(str(Foo8("foo")), "foofoo")
Brett Cannonc3647ac2005-04-26 03:45:26 +0000812 self.assertEqual(str(Foo9("foo")), "string")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000813 self.assertEqual(str(Foo9("foo")), "not unicode")
Brett Cannonc3647ac2005-04-26 03:45:26 +0000814
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000815 def test_unicode_repr(self):
816 class s1:
817 def __repr__(self):
818 return '\\n'
819
820 class s2:
821 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000822 return '\\n'
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000823
824 self.assertEqual(repr(s1()), '\\n')
825 self.assertEqual(repr(s2()), '\\n')
826
827
828
829
830
def test_main():
    """Hand this module's name to test_support.run_unittest()."""
    test_support.run_unittest(__name__)

# Run the tests when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()