blob: 49ef29df0096697437ec722e2146a2996ec84a06 [file] [log] [blame]
Martin v. Löwisa729daf2002-08-04 17:28:33 +00001# -*- coding: iso-8859-1 -*-
Guido van Rossuma831cac2000-03-10 23:23:21 +00002""" Test script for the Unicode implementation.
3
Guido van Rossuma831cac2000-03-10 23:23:21 +00004Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
Marc-André Lemburg36619082001-01-17 19:11:13 +00008"""#"
Walter Dörwald0fd583c2003-02-21 12:53:50 +00009import unittest, sys, string, codecs, new
10from test import test_support, string_tests
Guido van Rossuma831cac2000-03-10 23:23:21 +000011
# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search function that serves deliberately broken test codecs.

    For "test.unicode1" the returned encoder and decoder hand back a bare
    integer rather than a tuple; for "test.unicode2" they hand back a tuple
    whose items are integers.  Every other encoding name yields None.
    """
    def encode1(input, errors="strict"):
        return 42 # not a tuple
    def decode1(input, errors="strict"):
        return 42 # not a tuple
    def encode2(input, errors="strict"):
        return (42, 42) # no unicode
    def decode2(input, errors="strict"):
        return (42, 42) # no unicode

    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    if encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    return None
codecs.register(search_function)
29
Walter Dörwald0fd583c2003-02-21 12:53:50 +000030class UnicodeTest(
31 string_tests.CommonTest,
Walter Dörwald57d88e52004-08-26 16:53:04 +000032 string_tests.MixinStrUnicodeUserStringTest,
33 string_tests.MixinStrUnicodeTest,
Walter Dörwald0fd583c2003-02-21 12:53:50 +000034 ):
35 type2test = unicode
36
def checkequalnofix(self, result, object, methodname, *args):
    """Call object.methodname(*args) and compare the outcome with result.

    Both the value and the exact type of the outcome must match result.
    When the method hands back the very object it was called on, the call
    is repeated on a unicode subclass instance to check that the subclass
    instance itself is not returned.
    """
    actual = getattr(object, methodname)(*args)
    self.assertEqual(actual, result)
    self.assert_(type(actual) is type(result))

    # Only the "method returned its own receiver" case needs the extra
    # subclass check below.
    if actual is not object:
        return

    class usub(unicode):
        def __repr__(self):
            return 'usub(%r)' % unicode.__repr__(self)

    wrapped = usub(object)
    actual = getattr(wrapped, methodname)(*args)
    self.assertEqual(actual, result)
    self.assert_(wrapped is not actual)
Guido van Rossume4874ae2001-09-21 15:36:41 +000054
def test_literals(self):
    # Equivalent escape spellings must produce equal strings, and \U
    # escapes naming values above 0x10ffff must be rejected by eval().
    self.assertEqual(u'\xff', u'\u00ff')
    self.assertEqual(u'\uffff', u'\U0000ffff')

    invalid_sources = (
        'u\'\\Ufffffffe\'',
        'u\'\\Uffffffff\'',
        'u\'\\U%08x\'' % 0x110000,
    )
    for source in invalid_sources:
        self.assertRaises(UnicodeError, eval, source)
61
def test_repr(self):
    # The expected repr() spellings below are skipped on Java platforms.
    if sys.platform.startswith('java'):
        return

    # Test basic sanity of repr()
    simple_cases = (
        (u'abc', "u'abc'"),
        (u'ab\\c', "u'ab\\\\c'"),
        (u'ab\\', "u'ab\\\\'"),
        (u'\\c', "u'\\\\c'"),
        (u'\\', "u'\\\\'"),
        (u'\n', "u'\\n'"),
        (u'\r', "u'\\r'"),
        (u'\t', "u'\\t'"),
        (u'\b', "u'\\x08'"),
        (u"'\"", """u'\\'"'"""),
        (u"'\"", """u'\\'"'"""),
        (u"'", '''u"'"'''),
        (u'"', """u'"'"""),
    )
    for value, expected in simple_cases:
        self.assertEqual(repr(value), expected)

    # repr() of the 256 code points 0..255 joined into one string.
    latin1repr = (
        "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
        "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
        "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
        "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
        "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
        "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
        "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
        "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
        "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
        "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
        "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
        "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
        "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
        "\\xfe\\xff'")
    all_latin1 = u''.join(map(unichr, xrange(256)))
    testrepr = repr(all_latin1)
    self.assertEqual(testrepr, latin1repr)
95
def test_count(self):
    # Run the shared count checks first, then mix str and unicode operands
    # and exercise negative start/end offsets.
    string_tests.CommonTest.test_count(self)

    # check mixed argument types
    mixed_cases = (
        (3, 'aaa', (u'a',)),
        (0, 'aaa', (u'b',)),
        (3, u'aaa', ('a',)),
        (0, u'aaa', ('b',)),
        (0, u'aaa', ('b',)),
        (1, u'aaa', ('a', -1)),
        (3, u'aaa', ('a', -10)),
        (2, u'aaa', ('a', 0, -1)),
        (0, u'aaa', ('a', 0, -10)),
    )
    for expected, subject, call_args in mixed_cases:
        self.checkequalnofix(expected, subject, 'count', *call_args)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000108
def test_find(self):
    # Exercise unicode.find() with and without a start offset, then check
    # that a missing or non-string argument raises TypeError.
    # NOTE(review): unlike test_count/test_rfind/test_index in this class,
    # this override does not chain to a base-class test_find -- confirm
    # whether that omission is intentional.
    haystack = u'abcdefghiabc'
    find_cases = (
        (0, (u'abc',)),
        (9, (u'abc', 1)),
        (-1, (u'def', 4)),
    )
    for expected, call_args in find_cases:
        self.checkequalnofix(expected, haystack, 'find', *call_args)

    for bad_args in ((), (42,)):
        self.assertRaises(TypeError, u'hello'.find, *bad_args)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000116
def test_rfind(self):
    # Run the shared rfind checks, then search with str/unicode operands
    # of differing types.
    string_tests.CommonTest.test_rfind(self)

    # check mixed argument types
    mixed_cases = (
        (9, 'abcdefghiabc', u'abc'),
        (12, 'abcdefghiabc', u''),
        (12, u'abcdefghiabc', ''),
    )
    for expected, subject, needle in mixed_cases:
        self.checkequalnofix(expected, subject, 'rfind', needle)
Guido van Rossum8b264542000-12-19 02:22:31 +0000123
def test_index(self):
    # Run the shared index checks, then repeat a set of lookups with the
    # subject and the needle converted to opposite string types.
    string_tests.CommonTest.test_index(self)

    # Each entry: (expected index, subject text, needle text, extra args).
    found_cases = (
        (0, 'abcdefghiabc', '', ()),
        (3, 'abcdefghiabc', 'def', ()),
        (0, 'abcdefghiabc', 'abc', ()),
        (9, 'abcdefghiabc', 'abc', (1,)),
    )
    # Each entry: (subject text, needle text, extra args); index() must
    # raise ValueError for all of them.
    missing_cases = (
        ('abcdefghiabc', 'hib', ()),
        ('abcdefghiab', 'abc', (1,)),
        ('abcdefghi', 'ghi', (8,)),
        ('abcdefghi', 'ghi', (-1,)),
    )

    # check mixed argument types
    for subject_type, needle_type in ((str, unicode), (unicode, str)):
        for expected, text, needle, extra in found_cases:
            self.checkequalnofix(expected, subject_type(text), 'index',
                                 needle_type(needle), *extra)
        for text, needle, extra in missing_cases:
            self.assertRaises(ValueError, subject_type(text).index,
                              needle_type(needle), *extra)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000136
def test_rindex(self):
    # Run the shared rindex checks, then repeat a set of lookups with the
    # subject and the needle converted to opposite string types.
    string_tests.CommonTest.test_rindex(self)

    # Each entry: (expected index, subject text, needle text, extra args).
    found_cases = (
        (12, 'abcdefghiabc', '', ()),
        (3, 'abcdefghiabc', 'def', ()),
        (9, 'abcdefghiabc', 'abc', ()),
        (0, 'abcdefghiabc', 'abc', (0, -1)),
    )
    # Each entry: (subject text, needle text, extra args); rindex() must
    # raise ValueError for all of them.
    missing_cases = (
        ('abcdefghiabc', 'hib', ()),
        ('defghiabc', 'def', (1,)),
        ('defghiabc', 'abc', (0, -1)),
        ('abcdefghi', 'ghi', (0, 8)),
        ('abcdefghi', 'ghi', (0, -1)),
    )

    # check mixed argument types
    for subject_type, needle_type in ((str, unicode), (unicode, str)):
        for expected, text, needle, extra in found_cases:
            self.checkequalnofix(expected, subject_type(text), 'rindex',
                                 needle_type(needle), *extra)

        for text, needle, extra in missing_cases:
            self.assertRaises(ValueError, subject_type(text).rindex,
                              needle_type(needle), *extra)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000151
def test_translate(self):
    # Translate with tables whose values are None (delete), an ordinal,
    # or a unicode string of any length; a str value must be rejected.
    translate_cases = (
        (u'bbbc', u'abababc', {ord('a'):None}),
        (u'iiic', u'abababc', {ord('a'):None, ord('b'):ord('i')}),
        (u'iiix', u'abababc', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'}),
        (u'<i><i><i>c', u'abababc', {ord('a'):None, ord('b'):u'<i>'}),
        (u'c', u'abababc', {ord('a'):None, ord('b'):u''}),
        (u'xyyx', u'xzx', {ord('z'):u'yy'}),
    )
    for expected, subject, table in translate_cases:
        self.checkequalnofix(expected, subject, 'translate', table)

    self.assertRaises(TypeError, u'hello'.translate)
    str_valued_table = {ord('a'):''}
    self.assertRaises(TypeError, u'abababc'.translate, str_valued_table)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000162
def test_split(self):
    # Run the shared split checks, then split with subject and separator
    # of differing string types; the pieces are expected to be unicode.
    string_tests.CommonTest.test_split(self)

    # Mixed arguments
    mixed_cases = (
        ([u'a', u'b', u'c', u'd'], u'a//b//c//d', '//'),
        ([u'a', u'b', u'c', u'd'], 'a//b//c//d', u'//'),
        ([u'endcase ', u''], u'endcase test', 'test'),
    )
    for expected, subject, separator in mixed_cases:
        self.checkequalnofix(expected, subject, 'split', separator)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000170
def test_join(self):
    # Run the shared join checks, then join with separators and items of
    # differing string types; every result is expected to be unicode.
    string_tests.MixinStrUnicodeUserStringTest.test_join(self)

    check = self.checkequalnofix

    # mixed arguments -- unicode separator
    check(u'a b c d', u' ', 'join', ['a', 'b', u'c', u'd'])
    check(u'abcd', u'', 'join', (u'a', u'b', u'c', u'd'))
    check(u'w x y z', u' ', 'join', string_tests.Sequence('wxyz'))
    # mixed arguments -- str separator
    check(u'a b c d', ' ', 'join', [u'a', u'b', u'c', u'd'])
    check(u'a b c d', ' ', 'join', ['a', 'b', u'c', u'd'])
    check(u'abcd', '', 'join', (u'a', u'b', u'c', u'd'))
    check(u'w x y z', ' ', 'join', string_tests.Sequence(u'wxyz'))
Marc-André Lemburge5034372000-08-08 08:04:29 +0000182
def test_strip(self):
    # Run the shared strip checks, then confirm that stripping a unicode
    # string with the byte string "\xff" raises UnicodeError.
    string_tests.CommonTest.test_strip(self)
    strip_hello = u"hello".strip
    self.assertRaises(UnicodeError, strip_hello, "\xff")
Guido van Rossuma831cac2000-03-10 23:23:21 +0000186
def test_replace(self):
    # Run the shared replace checks, then call str.replace() with unicode
    # arguments and with a non-string replacement.
    string_tests.CommonTest.test_replace(self)

    # method call forwarded from str implementation because of unicode argument
    byte_subject = 'one!two!three!'
    self.checkequalnofix(u'one@two!three!', byte_subject, 'replace', u'!', u'@', 1)
    replace_on_str = 'replace'.replace
    self.assertRaises(TypeError, replace_on_str, u"r", 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000193
def test_comparison(self):
    # Compare equal and prefix-ordered strings across every str/unicode
    # operand pairing.
    # Comparisons:
    equal_pairs = ((u'abc', 'abc'), ('abc', u'abc'), (u'abc', u'abc'))
    for left, right in equal_pairs:
        self.assertEqual(left, right)

    greater_pairs = ((u'abcd', 'abc'), ('abcd', u'abc'), (u'abcd', u'abc'))
    for longer, shorter in greater_pairs:
        self.assert_(longer > shorter)

    lesser_pairs = ((u'abc', 'abcd'), ('abc', u'abcd'), (u'abc', u'abcd'))
    for shorter, longer in lesser_pairs:
        self.assert_(shorter < longer)

    # The branch below is disabled by its constant-false condition.
    if 0:
        # Move these tests to a Unicode collation module test...
        # Testing UTF-16 code point order comparisons...

        # No surrogates, no fixup required.
        self.assert_(u'\u0061' < u'\u20ac')
        # Non surrogate below surrogate value, no fixup required
        self.assert_(u'\u0061' < u'\ud800\udc02')

        # Non surrogate above surrogate value, fixup required
        def test_lecmp(s, s2):
            self.assert_(s < s2)

        def test_fixup(s):
            surrogate_pairs = (
                u'\ud800\udc01', u'\ud900\udc01', u'\uda00\udc01', u'\udb00\udc01',
                u'\ud800\udd01', u'\ud900\udd01', u'\uda00\udd01', u'\udb00\udd01',
                u'\ud800\ude01', u'\ud900\ude01', u'\uda00\ude01', u'\udb00\ude01',
                u'\ud800\udfff', u'\ud900\udfff', u'\uda00\udfff', u'\udb00\udfff',
            )
            for s2 in surrogate_pairs:
                test_lecmp(s, s2)

        test_fixup(u'\ue000')
        test_fixup(u'\uff61')

        # Surrogates on both sides, no fixup required
        self.assert_(u'\ud800\udc02' < u'\ud84d\udc56')
258
def test_islower(self):
    # Run the shared islower checks, then check that u'\u1FFc' is not
    # reported as lowercase.
    shared_tests = string_tests.MixinStrUnicodeUserStringTest
    shared_tests.test_islower(self)
    self.checkequalnofix(False, u'\u1FFc', 'islower')
Walter Dörwald28256f22003-01-19 16:59:20 +0000262
def test_isupper(self):
    # Run the shared isupper checks; the extra u'\u1FFc' check is skipped
    # on Java platforms.
    shared_tests = string_tests.MixinStrUnicodeUserStringTest
    shared_tests.test_isupper(self)
    if sys.platform.startswith('java'):
        return
    self.checkequalnofix(False, u'\u1FFc', 'isupper')
Walter Dörwald28256f22003-01-19 16:59:20 +0000267
def test_istitle(self):
    # Run the shared istitle checks, then check istitle() on strings that
    # contain u'\u1FFc'.
    # The base call used to target test_title (the title() tests), which
    # left the shared istitle checks unused by this override; every other
    # is*() test in this class chains to the base test of the same name.
    # NOTE(review): assumes the mixin defines test_istitle -- confirm
    # against string_tests.
    string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
    self.checkequalnofix(True, u'\u1FFc', 'istitle')
    self.checkequalnofix(True, u'Greek \u1FFcitlecases ...', 'istitle')
Walter Dörwald28256f22003-01-19 16:59:20 +0000272
def test_isspace(self):
    # Run the shared isspace checks, then check three non-ASCII code
    # points: two expected to be whitespace and one expected not to be.
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
    space_cases = (
        (True, u'\u2000'),
        (True, u'\u200a'),
        (False, u'\u2014'),
    )
    for expected, char in space_cases:
        self.checkequalnofix(expected, char, 'isspace')
Walter Dörwald28256f22003-01-19 16:59:20 +0000278
def test_isalpha(self):
    # Run the shared isalpha checks, then check that u'\u1FFc' counts as
    # alphabetic.
    shared_tests = string_tests.MixinStrUnicodeUserStringTest
    shared_tests.test_isalpha(self)
    self.checkequalnofix(True, u'\u1FFc', 'isalpha')
Walter Dörwald28256f22003-01-19 16:59:20 +0000282
def test_isdecimal(self):
    # Check isdecimal() on empty, ASCII and non-ASCII inputs, then check
    # that passing an argument raises TypeError.
    decimal_cases = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (False, u'\u2460'), # CIRCLED DIGIT ONE
        (False, u'\xbc'), # VULGAR FRACTION ONE QUARTER
        (True, u'\u0660'), # ARABIC-INDIC DIGIT ZERO
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, text in decimal_cases:
        self.checkequalnofix(expected, text, 'isdecimal')

    self.checkraises(TypeError, 'abc', 'isdecimal', 42)
Walter Dörwald28256f22003-01-19 16:59:20 +0000294
def test_isdigit(self):
    # Run the shared isdigit checks, then check three non-ASCII code
    # points.
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
    digit_cases = (
        (True, u'\u2460'),
        (False, u'\xbc'),
        (True, u'\u0660'),
    )
    for expected, char in digit_cases:
        self.checkequalnofix(expected, char, 'isdigit')
Walter Dörwald28256f22003-01-19 16:59:20 +0000300
def test_isnumeric(self):
    # Check isnumeric() on empty, ASCII and non-ASCII inputs, then check
    # that passing an argument raises TypeError.
    numeric_cases = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (True, u'\u2460'),
        (True, u'\xbc'),
        (True, u'\u0660'),
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, text in numeric_cases:
        self.checkequalnofix(expected, text, 'isnumeric')

    isnumeric = u"abc".isnumeric
    self.assertRaises(TypeError, isnumeric, 42)
312
def test_contains(self):
    # Testing Unicode contains method
    # Each table entry is (needle, container, expected membership); the
    # entries are checked in order with the plain in / not in operators.
    def check_membership(cases):
        for needle, container, present in cases:
            if present:
                self.assert_(needle in container)
            else:
                self.assert_(needle not in container)

    check_membership((
        ('a', u'abdb', True),
        ('a', u'bdab', True),
        ('a', u'bdaba', True),
        ('a', u'bdba', True),
        ('a', u'bdba', True),
        (u'a', u'bdba', True),
        (u'a', u'bdb', False),
        (u'a', 'bdb', False),
        (u'a', 'bdba', True),
        (u'a', ('a',1,None), True),
        (u'a', (1,None,'a'), True),
        (u'a', (1,None,u'a'), True),
        ('a', ('a',1,None), True),
        ('a', (1,None,'a'), True),
        ('a', (1,None,u'a'), True),
        ('a', ('x',1,u'y'), False),
        ('a', ('x',1,None), False),
        (u'abcd', u'abcxxxx', False),
        (u'ab', u'abcd', True),
        ('ab', u'abc', True),
        (u'ab', 'abc', True),
        (u'ab', (1,None,u'ab'), True),
        (u'', u'abc', True),
        ('', u'abc', True),
    ))

    # If the following fails either
    # the contains operator does not propagate UnicodeErrors or
    # someone has changed the default encoding
    self.assertRaises(UnicodeError, 'g\xe2teau'.__contains__, u'\xe2')

    check_membership((
        (u'', '', True),
        ('', u'', True),
        (u'', u'', True),
        (u'', 'abc', True),
        ('', u'abc', True),
        (u'', u'abc', True),
        (u'\0', 'abc', False),
        ('\0', u'abc', False),
        (u'\0', u'abc', False),
        (u'\0', '\0abc', True),
        ('\0', u'\0abc', True),
        (u'\0', u'\0abc', True),
        (u'\0', 'abc\0', True),
        ('\0', u'abc\0', True),
        (u'\0', u'abc\0', True),
        (u'a', '\0abc', True),
        ('a', u'\0abc', True),
        (u'a', u'\0abc', True),
        (u'asdf', 'asdf', True),
        ('asdf', u'asdf', True),
        (u'asdf', u'asdf', True),
        (u'asdf', 'asd', False),
        ('asdf', u'asd', False),
        (u'asdf', u'asd', False),
        (u'asdf', '', False),
        ('asdf', u'', False),
        (u'asdf', u'', False),
    ))

    self.assertRaises(TypeError, u"abc".__contains__)
374
def test_formatting(self):
    # Run the shared formatting checks, then exercise %-formatting with
    # unicode templates and with str templates that receive unicode
    # arguments.
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
    # %5.2f pads to a minimum width of five characters, so a value such as
    # 3.00 is preceded by one pad space in addition to the separator space
    # of the template; 1003.57 is already wider than five and gets none.
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57')
    if not sys.platform.startswith('java'):
        self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
    self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
    self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')

    self.assertEqual(u'%c' % 0x1234, u'\u1234')
    self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % u"abc", u'...abc...')
    # A '*' width of 5 pads the (possibly truncated) string to five
    # characters; a negative width left-justifies instead.
    self.assertEqual('%*s' % (5,u'abc',), u'  abc')
    self.assertEqual('%*s' % (-5,u'abc',), u'abc  ')
    self.assertEqual('%*.*s' % (5,2,u'abc',), u'   ab')
    self.assertEqual('%*.*s' % (5,3,u'abc',), u'  abc')
    self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103   abc')
    self.assertEqual('%c' % u'a', u'a')
    class Wrapper:
        def __str__(self):
            return u'\u1234'
    self.assertEqual('%s' % Wrapper(), u'\u1234')
Walter Dörwald28256f22003-01-19 16:59:20 +0000413
def test_format_float(self):
    # Check that u'%.1f' formats with a '.' decimal point even while the
    # process locale is 'de_DE'.  The test returns silently when the locale
    # module is unavailable or the locale cannot be set.
    #
    # The import is guarded separately from the setlocale() calls: a
    # combined "except (ImportError, locale.Error)" has to evaluate
    # locale.Error, which is itself unbound when the import failed.
    try:
        import locale
    except ImportError:
        return # skip if there is no locale support

    try:
        orig_locale = locale.setlocale(locale.LC_ALL)
        locale.setlocale(locale.LC_ALL, 'de_DE')
    except locale.Error:
        return # skip if we can't set locale

    try:
        # should not format with a comma, but always with C locale
        self.assertEqual(u'1.0', u'%.1f' % 1.0)
    finally:
        # Restore the previous locale even when the assertion fails.
        locale.setlocale(locale.LC_ALL, orig_locale)
427
Walter Dörwald28256f22003-01-19 16:59:20 +0000428 def test_constructor(self):
429 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
430
431 self.assertEqual(
432 unicode(u'unicode remains unicode'),
433 u'unicode remains unicode'
434 )
435
436 class UnicodeSubclass(unicode):
Marc-André Lemburg79f57832002-12-29 19:44:06 +0000437 pass
Guido van Rossuma831cac2000-03-10 23:23:21 +0000438
Walter Dörwald28256f22003-01-19 16:59:20 +0000439 self.assertEqual(
440 unicode(UnicodeSubclass('unicode subclass becomes unicode')),
441 u'unicode subclass becomes unicode'
442 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000443
Walter Dörwald28256f22003-01-19 16:59:20 +0000444 self.assertEqual(
445 unicode('strings are converted to unicode'),
446 u'strings are converted to unicode'
447 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000448
Walter Dörwald28256f22003-01-19 16:59:20 +0000449 class UnicodeCompat:
450 def __init__(self, x):
451 self.x = x
452 def __unicode__(self):
453 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000454
Walter Dörwald28256f22003-01-19 16:59:20 +0000455 self.assertEqual(
456 unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
457 u'__unicode__ compatible objects are recognized')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000458
Walter Dörwald28256f22003-01-19 16:59:20 +0000459 class StringCompat:
460 def __init__(self, x):
461 self.x = x
462 def __str__(self):
463 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000464
Walter Dörwald28256f22003-01-19 16:59:20 +0000465 self.assertEqual(
466 unicode(StringCompat('__str__ compatible objects are recognized')),
467 u'__str__ compatible objects are recognized'
468 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000469
Walter Dörwald28256f22003-01-19 16:59:20 +0000470 # unicode(obj) is compatible to str():
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000471
Walter Dörwald28256f22003-01-19 16:59:20 +0000472 o = StringCompat('unicode(obj) is compatible to str()')
473 self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
474 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000475
Marc-André Lemburgd25c6502004-07-23 16:13:25 +0000476 # %-formatting and .__unicode__()
477 self.assertEqual(u'%s' %
478 UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"),
479 u"u'%s' % obj uses obj.__unicode__()")
480 self.assertEqual(u'%s' %
481 UnicodeCompat(u"u'%s' % obj falls back to obj.__str__()"),
482 u"u'%s' % obj falls back to obj.__str__()")
483
Walter Dörwald28256f22003-01-19 16:59:20 +0000484 for obj in (123, 123.45, 123L):
485 self.assertEqual(unicode(obj), unicode(str(obj)))
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000486
Walter Dörwald28256f22003-01-19 16:59:20 +0000487 # unicode(obj, encoding, error) tests (this maps to
488 # PyUnicode_FromEncodedObject() at C level)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000489
Walter Dörwald28256f22003-01-19 16:59:20 +0000490 if not sys.platform.startswith('java'):
491 self.assertRaises(
492 TypeError,
493 unicode,
494 u'decoding unicode is not supported',
495 'utf-8',
496 'strict'
497 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000498
Walter Dörwald28256f22003-01-19 16:59:20 +0000499 self.assertEqual(
500 unicode('strings are decoded to unicode', 'utf-8', 'strict'),
501 u'strings are decoded to unicode'
502 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000503
Walter Dörwald28256f22003-01-19 16:59:20 +0000504 if not sys.platform.startswith('java'):
505 self.assertEqual(
506 unicode(
507 buffer('character buffers are decoded to unicode'),
508 'utf-8',
509 'strict'
510 ),
511 u'character buffers are decoded to unicode'
512 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000513
Walter Dörwald28256f22003-01-19 16:59:20 +0000514 self.assertRaises(TypeError, unicode, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000515
Walter Dörwald28256f22003-01-19 16:59:20 +0000516 def test_codecs_utf7(self):
517 utfTests = [
518 (u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
519 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
520 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
521 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
522 (u'+', '+-'),
523 (u'+-', '+--'),
524 (u'+?', '+-?'),
525 (u'\?', '+AFw?'),
526 (u'+?', '+-?'),
527 (ur'\\?', '+AFwAXA?'),
528 (ur'\\\?', '+AFwAXABc?'),
529 (ur'++--', '+-+---')
530 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000531
Walter Dörwald28256f22003-01-19 16:59:20 +0000532 for (x, y) in utfTests:
533 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000534
Walter Dörwald28256f22003-01-19 16:59:20 +0000535 # surrogates not supported
536 self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000537
Walter Dörwald28256f22003-01-19 16:59:20 +0000538 self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000539
def test_codecs_utf8(self):
    # Encode short strings, surrogate code units and a longer mixed text
    # to UTF-8, then decode a few UTF-8 byte sequences.
    short_cases = (
        (u'', ''),
        (u'\u20ac', '\xe2\x82\xac'),
        (u'\ud800\udc02', '\xf0\x90\x80\x82'),
        (u'\ud84d\udc56', '\xf0\xa3\x91\x96'),
        (u'\ud800', '\xed\xa0\x80'),
        (u'\udc00', '\xed\xb0\x80'),
    )
    for text, expected in short_cases:
        self.assertEqual(text.encode('utf-8'), expected)

    repeated = u'\ud800\udc02'*1000
    self.assertEqual(
        repeated.encode('utf-8'),
        '\xf0\x90\x80\x82'*1000
    )

    long_text = (
        u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
        u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
        u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
        u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
        u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
        u' Nunstuck git und'
    )
    long_encoded = (
        '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
        '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
        '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
        '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
        '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
        '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
        '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
        '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
        '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
        '\xe3\x80\x8cWenn ist das Nunstuck git und'
    )
    self.assertEqual(long_text.encode('utf-8'), long_encoded)

    # UTF-8 specific decoding tests
    decode_cases = (
        ('\xf0\xa3\x91\x96', u'\U00023456'),
        ('\xf0\x90\x80\x82', u'\U00010002'),
        ('\xe2\x82\xac', u'\u20ac'),
    )
    for raw, expected in decode_cases:
        self.assertEqual(unicode(raw, 'utf-8'), expected)

    # Other possible utf-8 test cases:
    # * strict decoding testing for all of the
    #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000578
def test_codecs_idna(self):
    # Test whether trailing dot is preserved
    hostname = u"www.python.org."
    encoded = hostname.encode("idna")
    self.assertEqual(encoded, "www.python.org.")
582
def test_codecs_errors(self):
    # Check the strict/ignore/replace error handlers for ASCII encoding
    # and decoding, malformed unicode-escape input, the broken test codecs
    # registered by search_function, and wrong argument types.
    text = u'Andr\202 x'
    raw = 'Andr\202 x'

    # Error handling (encoding)
    self.assertRaises(UnicodeError, text.encode, 'ascii')
    self.assertRaises(UnicodeError, text.encode, 'ascii','strict')
    self.assertEqual(text.encode('ascii','ignore'), "Andr x")
    self.assertEqual(text.encode('ascii','replace'), "Andr? x")

    # Error handling (decoding)
    self.assertRaises(UnicodeError, unicode, raw, 'ascii')
    self.assertRaises(UnicodeError, unicode, raw, 'ascii','strict')
    self.assertEqual(unicode(raw,'ascii','ignore'), u"Andr x")
    self.assertEqual(unicode(raw,'ascii','replace'), u'Andr\uFFFD x')

    # Error handling (unknown character names)
    decoded = "\\N{foo}xx".decode("unicode-escape", "ignore")
    self.assertEqual(decoded, u"xx")

    # Error handling (truncated escape sequence)
    self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")

    # Codecs whose encoder/decoder return values are malformed
    self.assertRaises(TypeError, "hello".decode, "test.unicode1")
    self.assertRaises(TypeError, unicode, "hello", "test.unicode2")
    for codec_name in ("test.unicode1", "test.unicode2"):
        self.assertRaises(TypeError, u"hello".encode, codec_name)

    # executes PyUnicode_Encode()
    import imp
    search_path = [u"non-existing dir"]
    self.assertRaises(
        ImportError,
        imp.find_module,
        "non-existing module",
        search_path
    )

    # Error handling (wrong arguments)
    self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)

    # Error handling (PyUnicode_EncodeDecimal())
    self.assertRaises(UnicodeError, int, u"\u0200")
Guido van Rossum97064862000-04-10 13:52:48 +0000620
def test_codecs(self):
    def roundtrip(text, codec):
        # Encode to a byte string and decode it again; the text must
        # come back unchanged.
        self.assertEqual(unicode(text.encode(codec), codec), text)

    # Encoding: plain ASCII text and the byte string each codec
    # is expected to produce for it.
    hello_bytes = (
        ('ascii', 'hello'),
        ('utf-7', 'hello'),
        ('utf-8', 'hello'),
        ('utf8', 'hello'),
        ('utf-16-le', 'h\000e\000l\000l\000o\000'),
        ('utf-16-be', '\000h\000e\000l\000l\000o'),
        ('latin-1', 'hello'),
    )
    for codec, encoded in hello_bytes:
        self.assertEqual(u'hello'.encode(codec), encoded)

    # Roundtrip safety for the start of the BMP.  Each entry pairs the
    # number of leading code points to check (1024, 256, 128) with the
    # codecs that have to round-trip every one of them.
    bmp_prefixes = (
        (1024, ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
                'utf-16-be', 'raw_unicode_escape',
                'unicode_escape', 'unicode_internal')),
        (256, ('latin-1',)),
        (128, ('ascii',)),
    )
    for count, codec_names in bmp_prefixes:
        for codepoint in xrange(count):
            char = unichr(codepoint)
            for codec in codec_names:
                roundtrip(char, codec)

    # Roundtrip safety for non-BMP (just a few chars).
    # 'raw_unicode_escape' is intentionally not exercised here.
    astral = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
    for codec in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                  'unicode_escape', 'unicode_internal'):
        roundtrip(astral, codec)

    # UTF-8 must be roundtrip safe for all UCS-2 code points
    # This excludes surrogates: in the full range, there would be
    # a surrogate pair (\udbff\udc00), which gets converted back
    # to a non-BMP character (\U0010fc00)
    ucs2_codepoints = range(0, 0xd800) + range(0xe000, 0x10000)
    all_ucs2 = u''.join([unichr(cp) for cp in ucs2_codepoints])
    roundtrip(all_ucs2, 'utf-8')
Guido van Rossum9e896b32000-04-05 20:11:21 +0000665
def test_codecs_charmap(self):
    def check_roundtrip(data, codec_names):
        # Decode the byte string with each charmap codec and encode the
        # result again; the bytes must come back unchanged.
        for codec in codec_names:
            self.assertEqual(unicode(data, codec).encode(codec), data)

    # 0-127
    low_half = ''.join([chr(value) for value in xrange(128)])
    low_half_codecs = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
        'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        'cp1256', 'cp1257', 'cp1258',
        'cp856', 'cp857', 'cp864', 'cp869', 'cp874',

        'mac_greek', 'mac_iceland', 'mac_roman', 'mac_turkish',
        'cp1006', 'iso8859_8',

        ### These have undefined mappings:
        #'cp424',

        ### These fail the round-trip:
        #'cp875'
    )
    check_roundtrip(low_half, low_half_codecs)

    # 128-255
    high_half = ''.join([chr(value) for value in xrange(128, 256)])
    high_half_codecs = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_4', 'iso8859_5',
        'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        ### These have undefined mappings:
        #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        #'cp1256', 'cp1257', 'cp1258',
        #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
        #'iso8859_3', 'iso8859_6', 'iso8859_7',
        #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',

        ### These fail the round-trip:
        #'cp1006', 'cp875', 'iso8859_8',
    )
    check_roundtrip(high_half, high_half_codecs)
Guido van Rossum9e896b32000-04-05 20:11:21 +0000719
def test_concatenation(self):
    # Adjacent string literals are joined at compile time.  Every mix of
    # str and unicode pieces must compare equal to the unicode text that
    # results from gluing the pieces together.
    joined_and_expected = (
        (u"abc" u"def", u"abcdef"),
        ("abc" u"def", u"abcdef"),
        (u"abc" "def", u"abcdef"),
        (u"abc" u"def" "ghi", u"abcdefghi"),
        ("abc" "def" u"ghi", u"abcdefghi"),
    )
    for joined, expected in joined_and_expected:
        self.assertEqual(joined, expected)
Fred Drake004d5e62000-10-23 17:22:08 +0000726
def test_printing(self):
    # Smoke test for the print statement with unicode arguments: nothing
    # is asserted, so the test passes as long as no statement raises.
    class BitBucket:
        # Minimal file-like object: accepts any text and discards it.
        def write(self, text):
            pass

    out = BitBucket()
    # Single unicode argument, then unicode/str arguments mixed in
    # either order.
    print >>out, u'abc'
    print >>out, u'abc', u'def'
    print >>out, u'abc', 'def'
    print >>out, 'abc', u'def'
    # Text that already ends in a newline, printed both without and with
    # a trailing comma (the trailing comma suppresses print's own newline).
    # The repeated statements are kept in this exact order on purpose.
    print >>out, u'abc\n'
    print >>out, u'abc\n',
    print >>out, u'abc\n',
    print >>out, u'def\n'
    print >>out, u'def\n'
Fred Drake004d5e62000-10-23 17:22:08 +0000742
def test_ucs4(self):
    # Only builds whose sys.maxunicode is above 0xFFFF run the check;
    # on a 0xFFFF build the test does nothing.
    if sys.maxunicode != 0xFFFF:
        original = u'\U00100000'
        escaped = original.encode("raw-unicode-escape")
        restored = escaped.decode("raw-unicode-escape")
        self.assertEqual(original, restored)
749
def test_conversion(self):
    # Make sure __unicode__() works properly

    # Classic class that only defines __str__.
    class ClassicStrOnly:
        def __str__(self):
            return "foo"

    # Classic class whose __unicode__ returns unicode.
    class ClassicUnicode:
        def __unicode__(self):
            return u"foo"

    # New-style class whose __unicode__ returns unicode.
    class NewStyleUnicode(object):
        def __unicode__(self):
            return u"foo"

    # New-style class whose __unicode__ returns a plain str.
    class NewStyleReturnsStr(object):
        def __unicode__(self):
            return "foo"

    # str subclass whose __unicode__ returns a plain str.
    class StrSubReturnsStr(str):
        def __unicode__(self):
            return "foo"

    # unicode subclass whose __unicode__ returns a plain str.
    class UnicodeSubReturnsStr(unicode):
        def __unicode__(self):
            return "foo"

    # str subclass defining both conversions with different results.
    class StrSubBoth(str):
        def __str__(self):
            return "foos"

        def __unicode__(self):
            return u"foou"

    # unicode subclass defining both conversions with different results.
    class UnicodeSubBoth(unicode):
        def __str__(self):
            return "foos"
        def __unicode__(self):
            return u"foou"

    # unicode subclass that doubles its content on construction and
    # returns itself from __unicode__.
    class UnicodeSubDoubling(unicode):
        def __new__(cls, content=""):
            return unicode.__new__(cls, 2*content)
        def __unicode__(self):
            return self

    # unicode subclass whose __str__ and __unicode__ both return str,
    # with different text.
    class UnicodeSubDistinct(unicode):
        def __str__(self):
            return "string"
        def __unicode__(self):
            return "not unicode"

    # (object to convert, expected result of unicode(object))
    conversions = (
        (ClassicStrOnly(), u"foo"),
        (ClassicUnicode(), u"foo"),
        (NewStyleUnicode(), u"foo"),
        (NewStyleReturnsStr(), u"foo"),
        (StrSubReturnsStr("bar"), u"foo"),
        (UnicodeSubReturnsStr("bar"), u"foo"),
        (StrSubBoth("bar"), u"foou"),
        (UnicodeSubBoth("bar"), u"foou"),
        (UnicodeSubDoubling("foo"), u"foofoo"),
    )
    for value, expected in conversions:
        self.assertEqual(unicode(value), expected)

    # str() and unicode() must each pick their own special method.
    self.assertEqual(str(UnicodeSubDistinct("foo")), "string")
    self.assertEqual(unicode(UnicodeSubDistinct("foo")), u"not unicode")
812
def test_main():
    """Run the tests in UnicodeTest through test_support.run_unittest()."""
    test_classes = [UnicodeTest]
    test_support.run_unittest(*test_classes)
Barry Warsaw817918c2002-08-06 16:58:21 +0000815
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()