blob: ccfa92207e3317336499a02ef444a904b3b14ac1 [file] [log] [blame]
# -*- coding: iso-8859-1 -*-
""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
Neal Norwitz9d72bb42007-04-17 08:48:32 +00009import unittest, sys, codecs, new
Walter Dörwald0fd583c2003-02-21 12:53:50 +000010from test import test_support, string_tests
Guido van Rossuma831cac2000-03-10 23:23:21 +000011
# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search function that serves deliberately broken test codecs.

    For ``"test.unicode1"`` the returned encoder and decoder both return a
    bare integer instead of a tuple.  For ``"test.unicode2"`` they return a
    tuple whose first item is an integer rather than a string.  Every other
    encoding name yields ``None``.

    The result, when not ``None``, is a 4-tuple
    ``(encoder, decoder, None, None)``.
    """
    def decode1(input, errors="strict"):
        return 42 # not a tuple
    def encode1(input, errors="strict"):
        return 42 # not a tuple
    def encode2(input, errors="strict"):
        return (42, 42) # no unicode
    def decode2(input, errors="strict"):
        return (42, 42) # no unicode
    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    elif encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    else:
        return None

# Make the broken codecs reachable through the normal codec lookup.
codecs.register(search_function)

class UnicodeTest(
    string_tests.CommonTest,
    string_tests.MixinStrUnicodeUserStringTest,
    string_tests.MixinStrUnicodeTest,
    ):
    # Type under test; presumably read by the string_tests mix-ins
    # (confirm against string_tests).
    type2test = str

37 def checkequalnofix(self, result, object, methodname, *args):
38 method = getattr(object, methodname)
39 realresult = method(*args)
40 self.assertEqual(realresult, result)
41 self.assert_(type(realresult) is type(result))
42
43 # if the original is returned make sure that
44 # this doesn't happen with subclasses
45 if realresult is object:
Guido van Rossumef87d6e2007-05-02 19:09:54 +000046 class usub(str):
Walter Dörwald0fd583c2003-02-21 12:53:50 +000047 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000048 return 'usub(%r)' % str.__repr__(self)
Walter Dörwald0fd583c2003-02-21 12:53:50 +000049 object = usub(object)
50 method = getattr(object, methodname)
51 realresult = method(*args)
52 self.assertEqual(realresult, result)
53 self.assert_(object is not realresult)
Guido van Rossume4874ae2001-09-21 15:36:41 +000054
Jeremy Hylton504de6b2003-10-06 05:08:26 +000055 def test_literals(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000056 self.assertEqual('\xff', '\u00ff')
57 self.assertEqual('\uffff', '\U0000ffff')
Jeremy Hylton504de6b2003-10-06 05:08:26 +000058 self.assertRaises(UnicodeError, eval, 'u\'\\Ufffffffe\'')
59 self.assertRaises(UnicodeError, eval, 'u\'\\Uffffffff\'')
60 self.assertRaises(UnicodeError, eval, 'u\'\\U%08x\'' % 0x110000)
61
Walter Dörwald28256f22003-01-19 16:59:20 +000062 def test_repr(self):
63 if not sys.platform.startswith('java'):
64 # Test basic sanity of repr()
Guido van Rossumef87d6e2007-05-02 19:09:54 +000065 self.assertEqual(repr('abc'), "u'abc'")
66 self.assertEqual(repr('ab\\c'), "u'ab\\\\c'")
67 self.assertEqual(repr('ab\\'), "u'ab\\\\'")
68 self.assertEqual(repr('\\c'), "u'\\\\c'")
69 self.assertEqual(repr('\\'), "u'\\\\'")
70 self.assertEqual(repr('\n'), "u'\\n'")
71 self.assertEqual(repr('\r'), "u'\\r'")
72 self.assertEqual(repr('\t'), "u'\\t'")
73 self.assertEqual(repr('\b'), "u'\\x08'")
74 self.assertEqual(repr("'\""), """u'\\'"'""")
75 self.assertEqual(repr("'\""), """u'\\'"'""")
76 self.assertEqual(repr("'"), '''u"'"''')
77 self.assertEqual(repr('"'), """u'"'""")
Walter Dörwald28256f22003-01-19 16:59:20 +000078 latin1repr = (
79 "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
80 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
81 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
82 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
83 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
84 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
85 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
86 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
87 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
88 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
89 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
90 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
91 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
92 "\\xfe\\xff'")
Guido van Rossum84fc66d2007-05-03 17:18:26 +000093 testrepr = repr(''.join(map(chr, xrange(256))))
Walter Dörwald28256f22003-01-19 16:59:20 +000094 self.assertEqual(testrepr, latin1repr)
Thomas Wouters89f507f2006-12-13 04:49:30 +000095 # Test repr works on wide unicode escapes without overflow.
Guido van Rossumef87d6e2007-05-02 19:09:54 +000096 self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
97 repr("\U00010000" * 39 + "\uffff" * 4096))
Walter Dörwald28256f22003-01-19 16:59:20 +000098
Guido van Rossum49d6b072006-08-17 21:11:47 +000099 def test_iterators(self):
100 # Make sure unicode objects have an __iter__ method
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000101 it = "\u1111\u2222\u3333".__iter__()
102 self.assertEqual(next(it), "\u1111")
103 self.assertEqual(next(it), "\u2222")
104 self.assertEqual(next(it), "\u3333")
Georg Brandla18af4e2007-04-21 15:47:16 +0000105 self.assertRaises(StopIteration, next, it)
Guido van Rossum49d6b072006-08-17 21:11:47 +0000106
Walter Dörwald28256f22003-01-19 16:59:20 +0000107 def test_count(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000108 string_tests.CommonTest.test_count(self)
109 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000110 self.checkequalnofix(3, 'aaa', 'count', 'a')
111 self.checkequalnofix(0, 'aaa', 'count', 'b')
112 self.checkequalnofix(3, 'aaa', 'count', 'a')
113 self.checkequalnofix(0, 'aaa', 'count', 'b')
114 self.checkequalnofix(0, 'aaa', 'count', 'b')
115 self.checkequalnofix(1, 'aaa', 'count', 'a', -1)
116 self.checkequalnofix(3, 'aaa', 'count', 'a', -10)
117 self.checkequalnofix(2, 'aaa', 'count', 'a', 0, -1)
118 self.checkequalnofix(0, 'aaa', 'count', 'a', 0, -10)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000119
Walter Dörwald28256f22003-01-19 16:59:20 +0000120 def test_find(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000121 self.checkequalnofix(0, 'abcdefghiabc', 'find', 'abc')
122 self.checkequalnofix(9, 'abcdefghiabc', 'find', 'abc', 1)
123 self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000124
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000125 self.assertRaises(TypeError, 'hello'.find)
126 self.assertRaises(TypeError, 'hello'.find, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000127
Walter Dörwald28256f22003-01-19 16:59:20 +0000128 def test_rfind(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000129 string_tests.CommonTest.test_rfind(self)
130 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000131 self.checkequalnofix(9, 'abcdefghiabc', 'rfind', 'abc')
132 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
133 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
Guido van Rossum8b264542000-12-19 02:22:31 +0000134
Walter Dörwald28256f22003-01-19 16:59:20 +0000135 def test_index(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000136 string_tests.CommonTest.test_index(self)
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000137 self.checkequalnofix(0, 'abcdefghiabc', 'index', '')
138 self.checkequalnofix(3, 'abcdefghiabc', 'index', 'def')
139 self.checkequalnofix(0, 'abcdefghiabc', 'index', 'abc')
140 self.checkequalnofix(9, 'abcdefghiabc', 'index', 'abc', 1)
141 self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib')
142 self.assertRaises(ValueError, 'abcdefghiab'.index, 'abc', 1)
143 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', 8)
144 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000145
Walter Dörwald28256f22003-01-19 16:59:20 +0000146 def test_rindex(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000147 string_tests.CommonTest.test_rindex(self)
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000148 self.checkequalnofix(12, 'abcdefghiabc', 'rindex', '')
149 self.checkequalnofix(3, 'abcdefghiabc', 'rindex', 'def')
150 self.checkequalnofix(9, 'abcdefghiabc', 'rindex', 'abc')
151 self.checkequalnofix(0, 'abcdefghiabc', 'rindex', 'abc', 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000152
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000153 self.assertRaises(ValueError, 'abcdefghiabc'.rindex, 'hib')
154 self.assertRaises(ValueError, 'defghiabc'.rindex, 'def', 1)
155 self.assertRaises(ValueError, 'defghiabc'.rindex, 'abc', 0, -1)
156 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, 8)
157 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000158
Walter Dörwald28256f22003-01-19 16:59:20 +0000159 def test_translate(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000160 self.checkequalnofix('bbbc', 'abababc', 'translate', {ord('a'):None})
161 self.checkequalnofix('iiic', 'abababc', 'translate', {ord('a'):None, ord('b'):ord('i')})
162 self.checkequalnofix('iiix', 'abababc', 'translate', {ord('a'):None, ord('b'):ord('i'), ord('c'):'x'})
163 self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', {ord('a'):None, ord('b'):'<i>'})
164 self.checkequalnofix('c', 'abababc', 'translate', {ord('a'):None, ord('b'):''})
165 self.checkequalnofix('xyyx', 'xzx', 'translate', {ord('z'):'yy'})
Guido van Rossuma831cac2000-03-10 23:23:21 +0000166
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000167 self.assertRaises(TypeError, 'hello'.translate)
168 self.assertRaises(TypeError, 'abababc'.translate, {ord('a'):''})
Guido van Rossuma831cac2000-03-10 23:23:21 +0000169
Walter Dörwald28256f22003-01-19 16:59:20 +0000170 def test_split(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000171 string_tests.CommonTest.test_split(self)
Andrew M. Kuchlingeddd68d2002-03-29 16:21:44 +0000172
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000173 # Mixed arguments
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000174 self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//')
175 self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//')
176 self.checkequalnofix(['endcase ', ''], 'endcase test', 'split', 'test')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000177
Walter Dörwald28256f22003-01-19 16:59:20 +0000178 def test_join(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000179 string_tests.MixinStrUnicodeUserStringTest.test_join(self)
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000180
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000181 # mixed arguments
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000182 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
183 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
184 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
185 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
186 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
187 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
188 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
Marc-André Lemburge5034372000-08-08 08:04:29 +0000189
Walter Dörwald28256f22003-01-19 16:59:20 +0000190 def test_strip(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000191 string_tests.CommonTest.test_strip(self)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000192 self.assertRaises(UnicodeError, "hello".strip, "\xff")
Guido van Rossuma831cac2000-03-10 23:23:21 +0000193
Walter Dörwald28256f22003-01-19 16:59:20 +0000194 def test_replace(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000195 string_tests.CommonTest.test_replace(self)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000196
Walter Dörwald28256f22003-01-19 16:59:20 +0000197 # method call forwarded from str implementation because of unicode argument
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000198 self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
199 self.assertRaises(TypeError, 'replace'.replace, "r", 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000200
Walter Dörwald28256f22003-01-19 16:59:20 +0000201 def test_comparison(self):
202 # Comparisons:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000203 self.assertEqual('abc', 'abc')
204 self.assertEqual('abc', 'abc')
205 self.assertEqual('abc', 'abc')
206 self.assert_('abcd' > 'abc')
207 self.assert_('abcd' > 'abc')
208 self.assert_('abcd' > 'abc')
209 self.assert_('abc' < 'abcd')
210 self.assert_('abc' < 'abcd')
211 self.assert_('abc' < 'abcd')
Walter Dörwald28256f22003-01-19 16:59:20 +0000212
213 if 0:
214 # Move these tests to a Unicode collation module test...
215 # Testing UTF-16 code point order comparisons...
216
217 # No surrogates, no fixup required.
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000218 self.assert_('\u0061' < '\u20ac')
Walter Dörwald28256f22003-01-19 16:59:20 +0000219 # Non surrogate below surrogate value, no fixup required
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000220 self.assert_('\u0061' < '\ud800\udc02')
Walter Dörwald28256f22003-01-19 16:59:20 +0000221
222 # Non surrogate above surrogate value, fixup required
223 def test_lecmp(s, s2):
224 self.assert_(s < s2)
225
226 def test_fixup(s):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000227 s2 = '\ud800\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000228 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000229 s2 = '\ud900\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000230 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000231 s2 = '\uda00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000232 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000233 s2 = '\udb00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000234 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000235 s2 = '\ud800\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000236 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000237 s2 = '\ud900\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000238 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000239 s2 = '\uda00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000240 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000241 s2 = '\udb00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000242 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000243 s2 = '\ud800\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000244 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000245 s2 = '\ud900\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000246 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000247 s2 = '\uda00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000248 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000249 s2 = '\udb00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000250 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000251 s2 = '\ud800\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000252 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000253 s2 = '\ud900\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000254 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000255 s2 = '\uda00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000256 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000257 s2 = '\udb00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000258 test_lecmp(s, s2)
259
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000260 test_fixup('\ue000')
261 test_fixup('\uff61')
Walter Dörwald28256f22003-01-19 16:59:20 +0000262
263 # Surrogates on both sides, no fixup required
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000264 self.assert_('\ud800\udc02' < '\ud84d\udc56')
Walter Dörwald28256f22003-01-19 16:59:20 +0000265
Walter Dörwald28256f22003-01-19 16:59:20 +0000266 def test_islower(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000267 string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000268 self.checkequalnofix(False, '\u1FFc', 'islower')
Walter Dörwald28256f22003-01-19 16:59:20 +0000269
270 def test_isupper(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000271 string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
272 if not sys.platform.startswith('java'):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000273 self.checkequalnofix(False, '\u1FFc', 'isupper')
Walter Dörwald28256f22003-01-19 16:59:20 +0000274
275 def test_istitle(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000276 string_tests.MixinStrUnicodeUserStringTest.test_title(self)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000277 self.checkequalnofix(True, '\u1FFc', 'istitle')
278 self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle')
Walter Dörwald28256f22003-01-19 16:59:20 +0000279
280 def test_isspace(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000281 string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000282 self.checkequalnofix(True, '\u2000', 'isspace')
283 self.checkequalnofix(True, '\u200a', 'isspace')
284 self.checkequalnofix(False, '\u2014', 'isspace')
Walter Dörwald28256f22003-01-19 16:59:20 +0000285
286 def test_isalpha(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000287 string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000288 self.checkequalnofix(True, '\u1FFc', 'isalpha')
Walter Dörwald28256f22003-01-19 16:59:20 +0000289
290 def test_isdecimal(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000291 self.checkequalnofix(False, '', 'isdecimal')
292 self.checkequalnofix(False, 'a', 'isdecimal')
293 self.checkequalnofix(True, '0', 'isdecimal')
294 self.checkequalnofix(False, '\u2460', 'isdecimal') # CIRCLED DIGIT ONE
295 self.checkequalnofix(False, '\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
296 self.checkequalnofix(True, '\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
297 self.checkequalnofix(True, '0123456789', 'isdecimal')
298 self.checkequalnofix(False, '0123456789a', 'isdecimal')
Walter Dörwald28256f22003-01-19 16:59:20 +0000299
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000300 self.checkraises(TypeError, 'abc', 'isdecimal', 42)
Walter Dörwald28256f22003-01-19 16:59:20 +0000301
302 def test_isdigit(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000303 string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000304 self.checkequalnofix(True, '\u2460', 'isdigit')
305 self.checkequalnofix(False, '\xbc', 'isdigit')
306 self.checkequalnofix(True, '\u0660', 'isdigit')
Walter Dörwald28256f22003-01-19 16:59:20 +0000307
308 def test_isnumeric(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000309 self.checkequalnofix(False, '', 'isnumeric')
310 self.checkequalnofix(False, 'a', 'isnumeric')
311 self.checkequalnofix(True, '0', 'isnumeric')
312 self.checkequalnofix(True, '\u2460', 'isnumeric')
313 self.checkequalnofix(True, '\xbc', 'isnumeric')
314 self.checkequalnofix(True, '\u0660', 'isnumeric')
315 self.checkequalnofix(True, '0123456789', 'isnumeric')
316 self.checkequalnofix(False, '0123456789a', 'isnumeric')
Walter Dörwald28256f22003-01-19 16:59:20 +0000317
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000318 self.assertRaises(TypeError, "abc".isnumeric, 42)
Walter Dörwald28256f22003-01-19 16:59:20 +0000319
Walter Dörwald28256f22003-01-19 16:59:20 +0000320 def test_contains(self):
321 # Testing Unicode contains method
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000322 self.assert_('a' in 'abdb')
323 self.assert_('a' in 'bdab')
324 self.assert_('a' in 'bdaba')
325 self.assert_('a' in 'bdba')
326 self.assert_('a' in 'bdba')
327 self.assert_('a' in 'bdba')
328 self.assert_('a' not in 'bdb')
329 self.assert_('a' not in 'bdb')
330 self.assert_('a' in 'bdba')
Walter Dörwald28256f22003-01-19 16:59:20 +0000331 self.assert_('a' in ('a',1,None))
332 self.assert_('a' in (1,None,'a'))
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000333 self.assert_('a' in (1,None,'a'))
334 self.assert_('a' in ('a',1,None))
335 self.assert_('a' in (1,None,'a'))
336 self.assert_('a' in (1,None,'a'))
337 self.assert_('a' not in ('x',1,'y'))
Walter Dörwald28256f22003-01-19 16:59:20 +0000338 self.assert_('a' not in ('x',1,None))
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000339 self.assert_('abcd' not in 'abcxxxx')
340 self.assert_('ab' in 'abcd')
341 self.assert_('ab' in 'abc')
342 self.assert_('ab' in 'abc')
343 self.assert_('ab' in (1,None,'ab'))
344 self.assert_('' in 'abc')
345 self.assert_('' in 'abc')
Walter Dörwald28256f22003-01-19 16:59:20 +0000346
347 # If the following fails either
348 # the contains operator does not propagate UnicodeErrors or
349 # someone has changed the default encoding
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000350 self.assertRaises(UnicodeError, 'g\xe2teau'.__contains__, '\xe2')
Walter Dörwald28256f22003-01-19 16:59:20 +0000351
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000352 self.assert_('' in '')
353 self.assert_('' in '')
354 self.assert_('' in '')
355 self.assert_('' in 'abc')
356 self.assert_('' in 'abc')
357 self.assert_('' in 'abc')
358 self.assert_('\0' not in 'abc')
359 self.assert_('\0' not in 'abc')
360 self.assert_('\0' not in 'abc')
361 self.assert_('\0' in '\0abc')
362 self.assert_('\0' in '\0abc')
363 self.assert_('\0' in '\0abc')
364 self.assert_('\0' in 'abc\0')
365 self.assert_('\0' in 'abc\0')
366 self.assert_('\0' in 'abc\0')
367 self.assert_('a' in '\0abc')
368 self.assert_('a' in '\0abc')
369 self.assert_('a' in '\0abc')
370 self.assert_('asdf' in 'asdf')
371 self.assert_('asdf' in 'asdf')
372 self.assert_('asdf' in 'asdf')
373 self.assert_('asdf' not in 'asd')
374 self.assert_('asdf' not in 'asd')
375 self.assert_('asdf' not in 'asd')
376 self.assert_('asdf' not in '')
377 self.assert_('asdf' not in '')
378 self.assert_('asdf' not in '')
Walter Dörwald28256f22003-01-19 16:59:20 +0000379
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000380 self.assertRaises(TypeError, "abc".__contains__)
Walter Dörwald28256f22003-01-19 16:59:20 +0000381
382 def test_formatting(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000383 string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
Walter Dörwald28256f22003-01-19 16:59:20 +0000384 # Testing Unicode formatting strings...
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000385 self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
386 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000, 3.00')
387 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000, 3.00')
388 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000, 3.50')
389 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000, 3.57')
390 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
Walter Dörwald28256f22003-01-19 16:59:20 +0000391 if not sys.platform.startswith('java'):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000392 self.assertEqual("%r, %r" % ("abc", "abc"), "u'abc', 'abc'")
393 self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
394 self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')
Walter Dörwald56fbcb52003-03-31 18:18:41 +0000395
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000396 self.assertEqual('%c' % 0x1234, '\u1234')
397 self.assertRaises(OverflowError, "%c".__mod__, (sys.maxunicode+1,))
Walter Dörwald28256f22003-01-19 16:59:20 +0000398
399 # formatting jobs delegated from the string implementation:
Walter Dörwald28256f22003-01-19 16:59:20 +0000400 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000401 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
402 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
403 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
404 self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
405 self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
406 self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
407 self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
408 self.assertEqual('...%s...' % "abc", '...abc...')
409 self.assertEqual('%*s' % (5,'abc',), ' abc')
410 self.assertEqual('%*s' % (-5,'abc',), 'abc ')
411 self.assertEqual('%*.*s' % (5,2,'abc',), ' ab')
412 self.assertEqual('%*.*s' % (5,3,'abc',), ' abc')
413 self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10 abc')
414 self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103 abc')
415 self.assertEqual('%c' % 'a', 'a')
Neil Schemenauercf52c072005-08-12 17:34:58 +0000416 class Wrapper:
417 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000418 return '\u1234'
419 self.assertEqual('%s' % Wrapper(), '\u1234')
Walter Dörwald28256f22003-01-19 16:59:20 +0000420
Thomas Wouters477c8d52006-05-27 19:21:47 +0000421 @test_support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
Georg Brandlda6b1072006-01-20 17:48:54 +0000422 def test_format_float(self):
Thomas Wouters477c8d52006-05-27 19:21:47 +0000423 # should not format with a comma, but always with C locale
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000424 self.assertEqual('1.0', '%.1f' % 1.0)
Georg Brandlda6b1072006-01-20 17:48:54 +0000425
Walter Dörwald28256f22003-01-19 16:59:20 +0000426 def test_constructor(self):
427 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
428
429 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000430 str('unicode remains unicode'),
431 'unicode remains unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +0000432 )
433
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000434 class UnicodeSubclass(str):
Marc-André Lemburg79f57832002-12-29 19:44:06 +0000435 pass
Guido van Rossuma831cac2000-03-10 23:23:21 +0000436
Walter Dörwald28256f22003-01-19 16:59:20 +0000437 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000438 str(UnicodeSubclass('unicode subclass becomes unicode')),
439 'unicode subclass becomes unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +0000440 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000441
Walter Dörwald28256f22003-01-19 16:59:20 +0000442 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000443 str('strings are converted to unicode'),
444 'strings are converted to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +0000445 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000446
Walter Dörwald28256f22003-01-19 16:59:20 +0000447 class UnicodeCompat:
448 def __init__(self, x):
449 self.x = x
450 def __unicode__(self):
451 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000452
Walter Dörwald28256f22003-01-19 16:59:20 +0000453 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000454 str(UnicodeCompat('__unicode__ compatible objects are recognized')),
455 '__unicode__ compatible objects are recognized')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000456
Walter Dörwald28256f22003-01-19 16:59:20 +0000457 class StringCompat:
458 def __init__(self, x):
459 self.x = x
460 def __str__(self):
461 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000462
Walter Dörwald28256f22003-01-19 16:59:20 +0000463 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000464 str(StringCompat('__str__ compatible objects are recognized')),
465 '__str__ compatible objects are recognized'
Walter Dörwald28256f22003-01-19 16:59:20 +0000466 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000467
Walter Dörwald28256f22003-01-19 16:59:20 +0000468 # unicode(obj) is compatible to str():
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000469
Walter Dörwald28256f22003-01-19 16:59:20 +0000470 o = StringCompat('unicode(obj) is compatible to str()')
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000471 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Walter Dörwald28256f22003-01-19 16:59:20 +0000472 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000473
Marc-André Lemburgd25c6502004-07-23 16:13:25 +0000474 # %-formatting and .__unicode__()
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000475 self.assertEqual('%s' %
476 UnicodeCompat("u'%s' % obj uses obj.__unicode__()"),
477 "u'%s' % obj uses obj.__unicode__()")
478 self.assertEqual('%s' %
479 UnicodeCompat("u'%s' % obj falls back to obj.__str__()"),
480 "u'%s' % obj falls back to obj.__str__()")
Marc-André Lemburgd25c6502004-07-23 16:13:25 +0000481
Guido van Rossume2a383d2007-01-15 16:59:06 +0000482 for obj in (123, 123.45, 123):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000483 self.assertEqual(str(obj), str(str(obj)))
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000484
Walter Dörwald28256f22003-01-19 16:59:20 +0000485 # unicode(obj, encoding, error) tests (this maps to
486 # PyUnicode_FromEncodedObject() at C level)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000487
Walter Dörwald28256f22003-01-19 16:59:20 +0000488 if not sys.platform.startswith('java'):
489 self.assertRaises(
490 TypeError,
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000491 str,
492 'decoding unicode is not supported',
Walter Dörwald28256f22003-01-19 16:59:20 +0000493 'utf-8',
494 'strict'
495 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000496
Walter Dörwald28256f22003-01-19 16:59:20 +0000497 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000498 str('strings are decoded to unicode', 'utf-8', 'strict'),
499 'strings are decoded to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +0000500 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000501
Walter Dörwald28256f22003-01-19 16:59:20 +0000502 if not sys.platform.startswith('java'):
503 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000504 str(
Walter Dörwald28256f22003-01-19 16:59:20 +0000505 buffer('character buffers are decoded to unicode'),
506 'utf-8',
507 'strict'
508 ),
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000509 'character buffers are decoded to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +0000510 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000511
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000512 self.assertRaises(TypeError, str, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000513
def test_codecs_utf7(self):
    """Exercise the UTF-7 codec with the RFC 2152 examples and edge cases.

    Every (text, expected) pair is passed through ``encode('utf-7')`` and
    compared with the expected encoded form.  The closing checks hand an
    encoded surrogate sequence to the decoder: it has to raise under the
    default error handling and produce U+FFFD with the 'replace' handler.
    """
    utfTests = [
        ('A\u2262\u0391.', 'A+ImIDkQ.'),              # RFC2152 example
        ('Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'),      # RFC2152 example
        ('\u65E5\u672C\u8A9E', '+ZeVnLIqe-'),         # RFC2152 example
        ('Item 3 is \u00a31.', 'Item 3 is +AKM-1.'),  # RFC2152 example
        ('+', '+-'),
        ('+-', '+--'),
        ('+?', '+-?'),
        # Raw literal, like the entries below: a bare '\?' is not a
        # recognised escape sequence and only works by accident.
        (r'\?', '+AFw?'),
        (r'\\?', '+AFwAXA?'),
        (r'\\\?', '+AFwAXABc?'),
        (r'++--', '+-+---'),
    ]

    for (x, y) in utfTests:
        self.assertEqual(x.encode('utf-7'), y)

    # surrogates not supported
    self.assertRaises(UnicodeError, str, '+3ADYAA-', 'utf-7')

    self.assertEqual(str('+3ADYAA-', 'utf-7', 'replace'), '\ufffd')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000537
def test_codecs_utf8(self):
    """Compare UTF-8 encoder and decoder output with known byte sequences."""
    check = self.assertEqual

    # (text, expected encoded form), checked in this order.
    encode_cases = (
        ('', ''),
        ('\u20ac', '\xe2\x82\xac'),
        ('\ud800\udc02', '\xf0\x90\x80\x82'),
        ('\ud84d\udc56', '\xf0\xa3\x91\x96'),
        ('\ud800', '\xed\xa0\x80'),
        ('\udc00', '\xed\xb0\x80'),
        ('\ud800\udc02'*1000, '\xf0\x90\x80\x82'*1000),
    )
    for text, expected in encode_cases:
        check(text.encode('utf-8'), expected)

    # A longer mixed-script sentence and its expected encoding.
    sentence = (
        '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
        '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
        '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
        '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
        '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
        ' Nunstuck git und'
    )
    sentence_utf8 = (
        '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
        '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
        '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
        '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
        '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
        '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
        '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
        '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
        '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
        '\xe3\x80\x8cWenn ist das Nunstuck git und'
    )
    check(sentence.encode('utf-8'), sentence_utf8)

    # UTF-8 specific decoding tests
    decode_cases = (
        ('\xf0\xa3\x91\x96', '\U00023456'),
        ('\xf0\x90\x80\x82', '\U00010002'),
        ('\xe2\x82\xac', '\u20ac'),
    )
    for raw, expected in decode_cases:
        check(str(raw, 'utf-8'), expected)

    # Other possible utf-8 test cases:
    # * strict decoding testing for all of the
    #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000576
def test_codecs_idna(self):
    """The idna codec must keep the trailing dot of a host name."""
    hostname = "www.python.org."
    encoded = hostname.encode("idna")
    self.assertEqual(encoded, "www.python.org.")
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +0000580
def test_codecs_errors(self):
    """Check how codecs and related calls react to bad input.

    Covers ASCII encoding/decoding with and without the 'strict',
    'ignore' and 'replace' handlers, malformed unicode-escape input, the
    deliberately broken ``test.unicode1``/``test.unicode2`` codecs
    registered at the top of this file, and calls with wrong arguments.
    """
    raises = self.assertRaises
    equal = self.assertEqual
    text = 'Andr\202 x'  # '\202' lies outside the ASCII range

    # Error handling (encoding): try with and without an explicit 'strict'
    for handler in ((), ('strict',)):
        raises(UnicodeError, text.encode, 'ascii', *handler)
    equal(text.encode('ascii', 'ignore'), "Andr x")
    equal(text.encode('ascii', 'replace'), "Andr? x")

    # Error handling (decoding): same pattern through the str() constructor
    for handler in ((), ('strict',)):
        raises(UnicodeError, str, text, 'ascii', *handler)
    equal(str(text, 'ascii', 'ignore'), "Andr x")
    equal(str(text, 'ascii', 'replace'), 'Andr\uFFFD x')

    # Error handling (unknown character names)
    equal("\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")

    # Error handling (truncated escape sequence)
    raises(UnicodeError, "\\".decode, "unicode-escape")

    # Codecs whose encode/decode functions return malformed results
    raises(TypeError, "hello".decode, "test.unicode1")
    raises(TypeError, str, "hello", "test.unicode2")
    for broken in ("test.unicode1", "test.unicode2"):
        raises(TypeError, "hello".encode, broken)

    # executes PyUnicode_Encode()
    import imp
    lookup_args = ("non-existing module", ["non-existing dir"])
    raises(ImportError, imp.find_module, *lookup_args)

    # Error handling (wrong arguments)
    raises(TypeError, "hello".encode, 42, 42, 42)

    # Error handling (PyUnicode_EncodeDecimal())
    raises(UnicodeError, int, "\u0200")
Guido van Rossum97064862000-04-10 13:52:48 +0000618
def test_codecs(self):
    """Encode a fixed word with several codecs, then round-trip code points.

    The round-trip sections encode a string, decode the result with the
    same codec and require the original string back.
    """
    def assert_roundtrip(text, encodings):
        # Encoding followed by decoding must give back the input unchanged.
        for name in encodings:
            self.assertEqual(str(text.encode(name), name), text)

    # Encoding
    word = 'hello'
    expected_forms = (
        ('ascii', 'hello'),
        ('utf-7', 'hello'),
        ('utf-8', 'hello'),
        ('utf8', 'hello'),
        ('utf-16-le', 'h\000e\000l\000l\000o\000'),
        ('utf-16-be', '\000h\000e\000l\000l\000o'),
        ('latin-1', 'hello'),
    )
    for codec, expected in expected_forms:
        self.assertEqual(word.encode(codec), expected)

    # Roundtrip safety for BMP (just the first 1024 chars)
    unicode_codecs = ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
                      'utf-16-be', 'raw_unicode_escape',
                      'unicode_escape', 'unicode_internal')
    for c in xrange(1024):
        assert_roundtrip(chr(c), unicode_codecs)

    # Roundtrip safety for BMP (just the first 256 chars)
    for c in xrange(256):
        assert_roundtrip(chr(c), ('latin-1',))

    # Roundtrip safety for BMP (just the first 128 chars)
    for c in xrange(128):
        assert_roundtrip(chr(c), ('ascii',))

    # Roundtrip safety for non-BMP (just a few chars)
    astral = '\U00010001\U00020002\U00030003\U00040004\U00050005'
    assert_roundtrip(astral, ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                              #'raw_unicode_escape',
                              'unicode_escape', 'unicode_internal'))

    # UTF-8 must be roundtrip safe for all UCS-2 code points
    # This excludes surrogates: in the full range, there would be
    # a surrogate pair (\udbff\udc00), which gets converted back
    # to a non-BMP character (\U0010fc00)
    code_points = range(0,0xd800)+range(0xe000,0x10000)
    assert_roundtrip(''.join(map(chr, code_points)), ('utf-8',))
Guido van Rossum9e896b32000-04-05 20:11:21 +0000663
def test_codecs_charmap(self):
    """Round-trip the code ranges 0-127 and 128-255 through charmap codecs.

    For each listed codec the input is decoded and encoded again with that
    codec; the result must equal the input.  Codecs that are known not to
    work for a given range are kept below as comments.
    """
    # Codecs checked for 0-127
    lower_half = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
        'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        'cp1256', 'cp1257', 'cp1258',
        'cp856', 'cp857', 'cp864', 'cp869', 'cp874',

        'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
        'cp1006', 'iso8859_8',

        ### These have undefined mappings:
        #'cp424',

        ### These fail the round-trip:
        #'cp875'
    )

    # Codecs checked for 128-255
    upper_half = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_4', 'iso8859_5',
        'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        ### These have undefined mappings:
        #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        #'cp1256', 'cp1257', 'cp1258',
        #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
        #'iso8859_3', 'iso8859_6', 'iso8859_7',
        #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',

        ### These fail the round-trip:
        #'cp1006', 'cp875', 'iso8859_8',
    )

    plan = (
        (xrange(128), lower_half),
        (xrange(128, 256), upper_half),
    )
    for codes, encodings in plan:
        s = ''.join(map(chr, codes))
        for encoding in encodings:
            self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +0000717
Walter Dörwald28256f22003-01-19 16:59:20 +0000718 def test_concatenation(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000719 self.assertEqual(("abc" "def"), "abcdef")
720 self.assertEqual(("abc" "def"), "abcdef")
721 self.assertEqual(("abc" "def"), "abcdef")
722 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
723 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +0000724
Walter Dörwald28256f22003-01-19 16:59:20 +0000725 def test_printing(self):
726 class BitBucket:
727 def write(self, text):
728 pass
Fred Drake004d5e62000-10-23 17:22:08 +0000729
Walter Dörwald28256f22003-01-19 16:59:20 +0000730 out = BitBucket()
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000731 print('abc', file=out)
732 print('abc', 'def', file=out)
733 print('abc', 'def', file=out)
734 print('abc', 'def', file=out)
735 print('abc\n', file=out)
736 print('abc\n', end=' ', file=out)
737 print('abc\n', end=' ', file=out)
738 print('def\n', file=out)
739 print('def\n', file=out)
Fred Drake004d5e62000-10-23 17:22:08 +0000740
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +0000741 def test_ucs4(self):
742 if sys.maxunicode == 0xFFFF:
743 return
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000744 x = '\U00100000'
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +0000745 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
746 self.assertEqual(x, y)
747
Brett Cannonc3647ac2005-04-26 03:45:26 +0000748 def test_conversion(self):
749 # Make sure __unicode__() works properly
750 class Foo0:
751 def __str__(self):
752 return "foo"
753
754 class Foo1:
755 def __unicode__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000756 return "foo"
Brett Cannonc3647ac2005-04-26 03:45:26 +0000757
758 class Foo2(object):
759 def __unicode__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000760 return "foo"
Brett Cannonc3647ac2005-04-26 03:45:26 +0000761
762 class Foo3(object):
763 def __unicode__(self):
764 return "foo"
765
766 class Foo4(str):
767 def __unicode__(self):
768 return "foo"
769
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000770 class Foo5(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +0000771 def __unicode__(self):
772 return "foo"
773
774 class Foo6(str):
775 def __str__(self):
776 return "foos"
777
778 def __unicode__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000779 return "foou"
Brett Cannonc3647ac2005-04-26 03:45:26 +0000780
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000781 class Foo7(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +0000782 def __str__(self):
783 return "foos"
784 def __unicode__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000785 return "foou"
Brett Cannonc3647ac2005-04-26 03:45:26 +0000786
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000787 class Foo8(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +0000788 def __new__(cls, content=""):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000789 return str.__new__(cls, 2*content)
Brett Cannonc3647ac2005-04-26 03:45:26 +0000790 def __unicode__(self):
791 return self
792
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000793 class Foo9(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +0000794 def __str__(self):
795 return "string"
796 def __unicode__(self):
797 return "not unicode"
798
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000799 self.assertEqual(str(Foo0()), "foo")
800 self.assertEqual(str(Foo1()), "foo")
801 self.assertEqual(str(Foo2()), "foo")
802 self.assertEqual(str(Foo3()), "foo")
803 self.assertEqual(str(Foo4("bar")), "foo")
804 self.assertEqual(str(Foo5("bar")), "foo")
805 self.assertEqual(str(Foo6("bar")), "foou")
806 self.assertEqual(str(Foo7("bar")), "foou")
807 self.assertEqual(str(Foo8("foo")), "foofoo")
Brett Cannonc3647ac2005-04-26 03:45:26 +0000808 self.assertEqual(str(Foo9("foo")), "string")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000809 self.assertEqual(str(Foo9("foo")), "not unicode")
Brett Cannonc3647ac2005-04-26 03:45:26 +0000810
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000811 def test_unicode_repr(self):
812 class s1:
813 def __repr__(self):
814 return '\\n'
815
816 class s2:
817 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000818 return '\\n'
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000819
820 self.assertEqual(repr(s1()), '\\n')
821 self.assertEqual(repr(s2()), '\\n')
822
823
824
825
826
def test_main():
    """Hand this module's name to test_support.run_unittest()."""
    module_name = __name__
    test_support.run_unittest(module_name)


# Run the tests when the file is executed as a script.
if __name__ == "__main__":
    test_main()