blob: 517ecfddfeb1693ab589ff1a0de5e16c0c1f5a4e [file] [log] [blame]
# -*- coding: iso-8859-1 -*-
""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
Walter Dörwald0fd583c2003-02-21 12:53:50 +00009import unittest, sys, string, codecs, new
10from test import test_support, string_tests
Guido van Rossuma831cac2000-03-10 23:23:21 +000011
# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook that serves two deliberately broken codecs.

    "test.unicode1" maps to an encoder/decoder pair returning a bare
    integer, "test.unicode2" to a pair returning a tuple of two integers.
    Any other encoding name yields None.
    """
    def encode1(input, errors="strict"):
        return 42 # not a tuple
    def decode1(input, errors="strict"):
        return 42 # not a tuple
    def encode2(input, errors="strict"):
        return (42, 42) # no unicode
    def decode2(input, errors="strict"):
        return (42, 42) # no unicode

    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    if encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    return None
codecs.register(search_function)
29
Walter Dörwald0fd583c2003-02-21 12:53:50 +000030class UnicodeTest(
31 string_tests.CommonTest,
Walter Dörwald57d88e52004-08-26 16:53:04 +000032 string_tests.MixinStrUnicodeUserStringTest,
33 string_tests.MixinStrUnicodeTest,
Walter Dörwald0fd583c2003-02-21 12:53:50 +000034 ):
35 type2test = unicode
36
def checkequalnofix(self, result, object, methodname, *args):
    """Call object.methodname(*args) and verify both value and type.

    The receiver and the arguments are used exactly as passed in.  The
    returned value must compare equal to result and be of the very same
    type.  When the call hands back the receiver itself, the call is
    repeated on a unicode subclass instance, which must not be returned
    as-is.
    """
    actual = getattr(object, methodname)(*args)
    self.assertEqual(actual, result)
    self.assert_(type(actual) is type(result))

    if actual is not object:
        return

    # if the original is returned make sure that
    # this doesn't happen with subclasses
    class usub(unicode):
        def __repr__(self):
            return 'usub(%r)' % unicode.__repr__(self)

    derived = usub(object)
    actual = getattr(derived, methodname)(*args)
    self.assertEqual(actual, result)
    self.assert_(derived is not actual)
Guido van Rossume4874ae2001-09-21 15:36:41 +000054
def test_literals(self):
    # Different escape spellings of one code point give equal literals.
    for short_form, long_form in ((u'\xff', u'\u00ff'),
                                  (u'\uffff', u'\U0000ffff')):
        self.assertEqual(short_form, long_form)
    # These \U escapes must be rejected when the literal is evaluated.
    for bad_literal in ('u\'\\Ufffffffe\'',
                        'u\'\\Uffffffff\'',
                        'u\'\\U%08x\'' % 0x110000):
        self.assertRaises(UnicodeError, eval, bad_literal)
61
def test_repr(self):
    # Nothing below is checked on Java platforms.
    if sys.platform.startswith('java'):
        return

    # Test basic sanity of repr()
    sanity_cases = (
        (u'abc', "u'abc'"),
        (u'ab\\c', "u'ab\\\\c'"),
        (u'ab\\', "u'ab\\\\'"),
        (u'\\c', "u'\\\\c'"),
        (u'\\', "u'\\\\'"),
        (u'\n', "u'\\n'"),
        (u'\r', "u'\\r'"),
        (u'\t', "u'\\t'"),
        (u'\b', "u'\\x08'"),
        (u"'\"", """u'\\'"'"""),
        (u"'\"", """u'\\'"'"""),
        (u"'", '''u"'"'''),
        (u'"', """u'"'"""),
    )
    for value, expected in sanity_cases:
        self.assertEqual(repr(value), expected)

    # Expected repr() of the string holding code points 0 through 255.
    latin1repr = (
        "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
        "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
        "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
        "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
        "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
        "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
        "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
        "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
        "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
        "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
        "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
        "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
        "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
        "\\xfe\\xff'")
    all_latin1 = u''.join([unichr(code) for code in xrange(256)])
    self.assertEqual(repr(all_latin1), latin1repr)
95
def test_iterators(self):
    # Make sure unicode objects have an __iter__ method
    it = u"\u1111\u2222\u3333".__iter__()
    # The iterator yields one character per step, in order ...
    for expected in (u"\u1111", u"\u2222", u"\u3333"):
        self.assertEqual(it.next(), expected)
    # ... and is exhausted afterwards.
    self.assertRaises(StopIteration, it.next)
103
def test_count(self):
    # Shared count() checks first, then str/unicode mixtures.
    string_tests.CommonTest.test_count(self)
    # check mixed argument types
    mixed_cases = (
        (3, 'aaa', (u'a',)),
        (0, 'aaa', (u'b',)),
        (3, u'aaa', ('a',)),
        (0, u'aaa', ('b',)),
        (0, u'aaa', ('b',)),
        (1, u'aaa', ('a', -1)),
        (3, u'aaa', ('a', -10)),
        (2, u'aaa', ('a', 0, -1)),
        (0, u'aaa', ('a', 0, -10)),
    )
    for expected, subject, call_args in mixed_cases:
        self.checkequalnofix(expected, subject, 'count', *call_args)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000116
def test_find(self):
    # Each case: (expected index, arguments passed to find()).
    haystack = u'abcdefghiabc'
    for expected, call_args in ((0, (u'abc',)),
                                (9, (u'abc', 1)),
                                (-1, (u'def', 4))):
        self.checkequalnofix(expected, haystack, 'find', *call_args)

    # A missing or non-string argument is a TypeError.
    self.assertRaises(TypeError, u'hello'.find)
    self.assertRaises(TypeError, u'hello'.find, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000124
def test_rfind(self):
    # Shared rfind() checks first, then str/unicode mixtures.
    string_tests.CommonTest.test_rfind(self)
    # check mixed argument types
    mixed_cases = (
        (9, 'abcdefghiabc', u'abc'),
        (12, 'abcdefghiabc', u''),
        (12, u'abcdefghiabc', ''),
    )
    for expected, subject, needle in mixed_cases:
        self.checkequalnofix(expected, subject, 'rfind', needle)
Guido van Rossum8b264542000-12-19 02:22:31 +0000131
def test_index(self):
    # Shared index() checks first, then both str/unicode pairings.
    string_tests.CommonTest.test_index(self)
    # check mixed argument types
    found_cases = (
        (0, 'abcdefghiabc', '', ()),
        (3, 'abcdefghiabc', 'def', ()),
        (0, 'abcdefghiabc', 'abc', ()),
        (9, 'abcdefghiabc', 'abc', (1,)),
    )
    missing_cases = (
        ('abcdefghiabc', 'hib', ()),
        ('abcdefghiab', 'abc', (1,)),
        ('abcdefghi', 'ghi', (8,)),
        ('abcdefghi', 'ghi', (-1,)),
    )
    for (t1, t2) in ((str, unicode), (unicode, str)):
        for expected, text, sub, bounds in found_cases:
            self.checkequalnofix(expected, t1(text), 'index', t2(sub), *bounds)
        for text, sub, bounds in missing_cases:
            self.assertRaises(ValueError, t1(text).index, t2(sub), *bounds)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000144
def test_rindex(self):
    # Shared rindex() checks first, then both str/unicode pairings.
    string_tests.CommonTest.test_rindex(self)
    # check mixed argument types
    found_cases = (
        (12, 'abcdefghiabc', '', ()),
        (3, 'abcdefghiabc', 'def', ()),
        (9, 'abcdefghiabc', 'abc', ()),
        (0, 'abcdefghiabc', 'abc', (0, -1)),
    )
    missing_cases = (
        ('abcdefghiabc', 'hib', ()),
        ('defghiabc', 'def', (1,)),
        ('defghiabc', 'abc', (0, -1)),
        ('abcdefghi', 'ghi', (0, 8)),
        ('abcdefghi', 'ghi', (0, -1)),
    )
    for (t1, t2) in ((str, unicode), (unicode, str)):
        for expected, text, sub, bounds in found_cases:
            self.checkequalnofix(expected, t1(text), 'rindex', t2(sub), *bounds)

        for text, sub, bounds in missing_cases:
            self.assertRaises(ValueError, t1(text).rindex, t2(sub), *bounds)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000159
def test_translate(self):
    # Each case: (expected result, input, mapping).  Mapping values may
    # be None, an ordinal or a unicode string of any length.
    cases = (
        (u'bbbc', u'abababc', {ord('a'):None}),
        (u'iiic', u'abababc', {ord('a'):None, ord('b'):ord('i')}),
        (u'iiix', u'abababc', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'}),
        (u'<i><i><i>c', u'abababc', {ord('a'):None, ord('b'):u'<i>'}),
        (u'c', u'abababc', {ord('a'):None, ord('b'):u''}),
        (u'xyyx', u'xzx', {ord('z'):u'yy'}),
    )
    for expected, source, mapping in cases:
        self.checkequalnofix(expected, source, 'translate', mapping)

    # No mapping at all, or a byte string as mapping value, is rejected.
    self.assertRaises(TypeError, u'hello'.translate)
    self.assertRaises(TypeError, u'abababc'.translate, {ord('a'):''})
Guido van Rossuma831cac2000-03-10 23:23:21 +0000170
def test_split(self):
    # Shared split() checks first, then str/unicode mixtures, which
    # must always produce unicode pieces.
    string_tests.CommonTest.test_split(self)

    # Mixed arguments
    pieces = [u'a', u'b', u'c', u'd']
    self.checkequalnofix(pieces, u'a//b//c//d', 'split', '//')
    self.checkequalnofix(pieces, 'a//b//c//d', 'split', u'//')
    self.checkequalnofix([u'endcase ', u''], u'endcase test', 'split', 'test')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000178
def test_join(self):
    # Shared join() checks first, then str/unicode mixtures.
    string_tests.MixinStrUnicodeUserStringTest.test_join(self)

    # mixed arguments
    # Each case: (expected result, separator, items to join).
    mixed_cases = (
        (u'a b c d', u' ', ['a', 'b', u'c', u'd']),
        (u'abcd', u'', (u'a', u'b', u'c', u'd')),
        (u'w x y z', u' ', string_tests.Sequence('wxyz')),
        (u'a b c d', ' ', [u'a', u'b', u'c', u'd']),
        (u'a b c d', ' ', ['a', 'b', u'c', u'd']),
        (u'abcd', '', (u'a', u'b', u'c', u'd')),
        (u'w x y z', ' ', string_tests.Sequence(u'wxyz')),
    )
    for expected, separator, items in mixed_cases:
        self.checkequalnofix(expected, separator, 'join', items)
Marc-André Lemburge5034372000-08-08 08:04:29 +0000190
def test_strip(self):
    # Shared strip() checks, plus a byte-string argument that is
    # expected to raise a UnicodeError.
    inherited = string_tests.CommonTest.test_strip
    inherited(self)
    self.assertRaises(UnicodeError, u"hello".strip, "\xff")
Guido van Rossuma831cac2000-03-10 23:23:21 +0000194
def test_replace(self):
    # Shared replace() checks first.
    string_tests.CommonTest.test_replace(self)

    # method call forwarded from str implementation because of unicode argument
    expected = u'one@two!three!'
    self.checkequalnofix(expected, 'one!two!three!', 'replace', u'!', u'@', 1)
    self.assertRaises(TypeError, 'replace'.replace, u"r", 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000201
def test_comparison(self):
    # Comparisons:
    # equality, greater-than and less-than across str/unicode mixtures
    for left, right in ((u'abc', 'abc'),
                        ('abc', u'abc'),
                        (u'abc', u'abc')):
        self.assertEqual(left, right)
    for longer, shorter in ((u'abcd', 'abc'),
                            ('abcd', u'abc'),
                            (u'abcd', u'abc')):
        self.assert_(longer > shorter)
    for shorter, longer in ((u'abc', 'abcd'),
                            ('abc', u'abcd'),
                            (u'abc', u'abcd')):
        self.assert_(shorter < longer)

    # Everything below is switched off on purpose.
    if 0:
        # Move these tests to a Unicode collation module test...
        # Testing UTF-16 code point order comparisons...

        # No surrogates, no fixup required.
        self.assert_(u'\u0061' < u'\u20ac')
        # Non surrogate below surrogate value, no fixup required
        self.assert_(u'\u0061' < u'\ud800\udc02')

        # Non surrogate above surrogate value, fixup required
        def test_lecmp(s, s2):
            self.assert_(s < s2)

        def test_fixup(s):
            surrogate_pairs = (
                u'\ud800\udc01', u'\ud900\udc01',
                u'\uda00\udc01', u'\udb00\udc01',
                u'\ud800\udd01', u'\ud900\udd01',
                u'\uda00\udd01', u'\udb00\udd01',
                u'\ud800\ude01', u'\ud900\ude01',
                u'\uda00\ude01', u'\udb00\ude01',
                u'\ud800\udfff', u'\ud900\udfff',
                u'\uda00\udfff', u'\udb00\udfff',
            )
            for s2 in surrogate_pairs:
                test_lecmp(s, s2)

        test_fixup(u'\ue000')
        test_fixup(u'\uff61')

        # Surrogates on both sides, no fixup required
        self.assert_(u'\ud800\udc02' < u'\ud84d\udc56')
266
def test_islower(self):
    # Shared islower() checks, plus one non-ASCII character that must
    # not count as lowercase.
    inherited = string_tests.MixinStrUnicodeUserStringTest.test_islower
    inherited(self)
    self.checkequalnofix(False, u'\u1FFc', 'islower')
Walter Dörwald28256f22003-01-19 16:59:20 +0000270
def test_isupper(self):
    # Shared isupper() checks; the extra non-ASCII case is skipped on
    # Java platforms.
    string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
    if sys.platform.startswith('java'):
        return
    self.checkequalnofix(False, u'\u1FFc', 'isupper')
Walter Dörwald28256f22003-01-19 16:59:20 +0000275
def test_istitle(self):
    # Run the shared istitle() checks.  This used to call the inherited
    # test_title(), which exercises title() rather than istitle(), so
    # the shared istitle() checks were never run from this override.
    string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
    # Unicode-only cases: U+1FFC counts as a titlecased character, both
    # alone and as the first letter of a word.
    self.checkequalnofix(True, u'\u1FFc', 'istitle')
    self.checkequalnofix(True, u'Greek \u1FFcitlecases ...', 'istitle')
Walter Dörwald28256f22003-01-19 16:59:20 +0000280
def test_isspace(self):
    # Shared isspace() checks, then a few non-ASCII code points.
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
    for expected, char in ((True, u'\u2000'),
                           (True, u'\u200a'),
                           (False, u'\u2014')):
        self.checkequalnofix(expected, char, 'isspace')
Walter Dörwald28256f22003-01-19 16:59:20 +0000286
def test_isalpha(self):
    # Shared isalpha() checks, plus one non-ASCII letter.
    inherited = string_tests.MixinStrUnicodeUserStringTest.test_isalpha
    inherited(self)
    self.checkequalnofix(True, u'\u1FFc', 'isalpha')
Walter Dörwald28256f22003-01-19 16:59:20 +0000290
def test_isdecimal(self):
    # Each case: (expected answer, string under test).
    cases = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (False, u'\u2460'), # CIRCLED DIGIT ONE
        (False, u'\xbc'), # VULGAR FRACTION ONE QUARTER
        (True, u'\u0660'), # ARABIC-INDIC DIGIT ZERO
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, text in cases:
        self.checkequalnofix(expected, text, 'isdecimal')

    # isdecimal() accepts no arguments.
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)
Walter Dörwald28256f22003-01-19 16:59:20 +0000302
def test_isdigit(self):
    # Shared isdigit() checks, then a few non-ASCII code points.
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
    for expected, char in ((True, u'\u2460'),
                           (False, u'\xbc'),
                           (True, u'\u0660')):
        self.checkequalnofix(expected, char, 'isdigit')
Walter Dörwald28256f22003-01-19 16:59:20 +0000308
def test_isnumeric(self):
    # Each case: (expected answer, string under test).
    cases = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (True, u'\u2460'),
        (True, u'\xbc'),
        (True, u'\u0660'),
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, text in cases:
        self.checkequalnofix(expected, text, 'isnumeric')

    # isnumeric() accepts no arguments.
    self.assertRaises(TypeError, u"abc".isnumeric, 42)
320
def test_contains(self):
    # Testing Unicode contains method
    # Each case: (needle, container, whether the needle must be found).
    def check_all(cases):
        for needle, container, present in cases:
            if present:
                self.assert_(needle in container)
            else:
                self.assert_(needle not in container)

    check_all((
        ('a', u'abdb', True),
        ('a', u'bdab', True),
        ('a', u'bdaba', True),
        ('a', u'bdba', True),
        ('a', u'bdba', True),
        (u'a', u'bdba', True),
        (u'a', u'bdb', False),
        (u'a', 'bdb', False),
        (u'a', 'bdba', True),
        (u'a', ('a',1,None), True),
        (u'a', (1,None,'a'), True),
        (u'a', (1,None,u'a'), True),
        ('a', ('a',1,None), True),
        ('a', (1,None,'a'), True),
        ('a', (1,None,u'a'), True),
        ('a', ('x',1,u'y'), False),
        ('a', ('x',1,None), False),
        (u'abcd', u'abcxxxx', False),
        (u'ab', u'abcd', True),
        ('ab', u'abc', True),
        (u'ab', 'abc', True),
        (u'ab', (1,None,u'ab'), True),
        (u'', u'abc', True),
        ('', u'abc', True),
    ))

    # If the following fails either
    # the contains operator does not propagate UnicodeErrors or
    # someone has changed the default encoding
    self.assertRaises(UnicodeError, 'g\xe2teau'.__contains__, u'\xe2')

    check_all((
        (u'', '', True),
        ('', u'', True),
        (u'', u'', True),
        (u'', 'abc', True),
        ('', u'abc', True),
        (u'', u'abc', True),
        (u'\0', 'abc', False),
        ('\0', u'abc', False),
        (u'\0', u'abc', False),
        (u'\0', '\0abc', True),
        ('\0', u'\0abc', True),
        (u'\0', u'\0abc', True),
        (u'\0', 'abc\0', True),
        ('\0', u'abc\0', True),
        (u'\0', u'abc\0', True),
        (u'a', '\0abc', True),
        ('a', u'\0abc', True),
        (u'a', u'\0abc', True),
        (u'asdf', 'asdf', True),
        ('asdf', u'asdf', True),
        (u'asdf', u'asdf', True),
        (u'asdf', 'asd', False),
        ('asdf', u'asd', False),
        (u'asdf', u'asd', False),
        (u'asdf', '', False),
        ('asdf', u'', False),
        (u'asdf', u'', False),
    ))

    # __contains__() needs exactly one argument.
    self.assertRaises(TypeError, u"abc".__contains__)
382
def test_formatting(self):
    # Shared formatting checks first.
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
    mixed_format = u"%s, %s, %i, %f, %5.2f"
    numeric_cases = (
        ((u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000, 3.00'),
        ((u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000, 3.00'),
        ((u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000, 3.50'),
        ((u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000, 3.57'),
        ((u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57'),
    )
    for values, expected in numeric_cases:
        self.assertEqual(mixed_format % values, expected)
    # The %r check is skipped on Java platforms.
    if not sys.platform.startswith('java'):
        self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
    self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
    self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')

    # %c takes an ordinal, but not one above sys.maxunicode.
    self.assertEqual(u'%c' % 0x1234, u'\u1234')
    self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))

    # formatting jobs delegated from the string implementation:
    # Each case: (byte-string format, values, expected result).
    delegated_cases = (
        ('...%(foo)s...', {'foo':u"abc"}, u'...abc...'),
        ('...%(foo)s...', {'foo':"abc"}, '...abc...'),
        ('...%(foo)s...', {u'foo':"abc"}, '...abc...'),
        ('...%(foo)s...', {u'foo':u"abc"}, u'...abc...'),
        ('...%(foo)s...', {u'foo':u"abc",'def':123}, u'...abc...'),
        ('...%(foo)s...', {u'foo':u"abc",u'def':123}, u'...abc...'),
        ('...%s...%s...%s...%s...', (1,2,3,u"abc"), u'...1...2...3...abc...'),
        ('...%%...%%s...%s...%s...%s...%s...', (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...'),
        ('...%s...', u"abc", u'...abc...'),
        ('%*s', (5,u'abc',), u' abc'),
        ('%*s', (-5,u'abc',), u'abc '),
        ('%*.*s', (5,2,u'abc',), u' ab'),
        ('%*.*s', (5,3,u'abc',), u' abc'),
        ('%i %*.*s', (10, 5,3,u'abc',), u'10 abc'),
        ('%i%s %*.*s', (10, 3, 5, 3, u'abc',), u'103 abc'),
        ('%c', u'a', u'a'),
    )
    for template, values, expected in delegated_cases:
        self.assertEqual(template % values, expected)

    # An object whose __str__() returns unicode, formatted with '%s'.
    class Wrapper:
        def __str__(self):
            return u'\u1234'
    self.assertEqual('%s' % Wrapper(), u'\u1234')
Walter Dörwald28256f22003-01-19 16:59:20 +0000421
@test_support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    # should not format with a comma, but always with C locale
    formatted = u'%.1f' % 1.0
    self.assertEqual(u'1.0', formatted)
Georg Brandlda6b1072006-01-20 17:48:54 +0000426
Walter Dörwald28256f22003-01-19 16:59:20 +0000427 def test_constructor(self):
428 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
429
430 self.assertEqual(
431 unicode(u'unicode remains unicode'),
432 u'unicode remains unicode'
433 )
434
435 class UnicodeSubclass(unicode):
Marc-André Lemburg79f57832002-12-29 19:44:06 +0000436 pass
Guido van Rossuma831cac2000-03-10 23:23:21 +0000437
Walter Dörwald28256f22003-01-19 16:59:20 +0000438 self.assertEqual(
439 unicode(UnicodeSubclass('unicode subclass becomes unicode')),
440 u'unicode subclass becomes unicode'
441 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000442
Walter Dörwald28256f22003-01-19 16:59:20 +0000443 self.assertEqual(
444 unicode('strings are converted to unicode'),
445 u'strings are converted to unicode'
446 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000447
Walter Dörwald28256f22003-01-19 16:59:20 +0000448 class UnicodeCompat:
449 def __init__(self, x):
450 self.x = x
451 def __unicode__(self):
452 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000453
Walter Dörwald28256f22003-01-19 16:59:20 +0000454 self.assertEqual(
455 unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
456 u'__unicode__ compatible objects are recognized')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000457
Walter Dörwald28256f22003-01-19 16:59:20 +0000458 class StringCompat:
459 def __init__(self, x):
460 self.x = x
461 def __str__(self):
462 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000463
Walter Dörwald28256f22003-01-19 16:59:20 +0000464 self.assertEqual(
465 unicode(StringCompat('__str__ compatible objects are recognized')),
466 u'__str__ compatible objects are recognized'
467 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000468
Walter Dörwald28256f22003-01-19 16:59:20 +0000469 # unicode(obj) is compatible to str():
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000470
Walter Dörwald28256f22003-01-19 16:59:20 +0000471 o = StringCompat('unicode(obj) is compatible to str()')
472 self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
473 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000474
Marc-André Lemburgd25c6502004-07-23 16:13:25 +0000475 # %-formatting and .__unicode__()
476 self.assertEqual(u'%s' %
477 UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"),
478 u"u'%s' % obj uses obj.__unicode__()")
479 self.assertEqual(u'%s' %
480 UnicodeCompat(u"u'%s' % obj falls back to obj.__str__()"),
481 u"u'%s' % obj falls back to obj.__str__()")
482
Walter Dörwald28256f22003-01-19 16:59:20 +0000483 for obj in (123, 123.45, 123L):
484 self.assertEqual(unicode(obj), unicode(str(obj)))
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000485
Walter Dörwald28256f22003-01-19 16:59:20 +0000486 # unicode(obj, encoding, error) tests (this maps to
487 # PyUnicode_FromEncodedObject() at C level)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000488
Walter Dörwald28256f22003-01-19 16:59:20 +0000489 if not sys.platform.startswith('java'):
490 self.assertRaises(
491 TypeError,
492 unicode,
493 u'decoding unicode is not supported',
494 'utf-8',
495 'strict'
496 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000497
Walter Dörwald28256f22003-01-19 16:59:20 +0000498 self.assertEqual(
499 unicode('strings are decoded to unicode', 'utf-8', 'strict'),
500 u'strings are decoded to unicode'
501 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000502
Walter Dörwald28256f22003-01-19 16:59:20 +0000503 if not sys.platform.startswith('java'):
504 self.assertEqual(
505 unicode(
506 buffer('character buffers are decoded to unicode'),
507 'utf-8',
508 'strict'
509 ),
510 u'character buffers are decoded to unicode'
511 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000512
Walter Dörwald28256f22003-01-19 16:59:20 +0000513 self.assertRaises(TypeError, unicode, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000514
Walter Dörwald28256f22003-01-19 16:59:20 +0000515 def test_codecs_utf7(self):
516 utfTests = [
517 (u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
518 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
519 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
520 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
521 (u'+', '+-'),
522 (u'+-', '+--'),
523 (u'+?', '+-?'),
524 (u'\?', '+AFw?'),
525 (u'+?', '+-?'),
526 (ur'\\?', '+AFwAXA?'),
527 (ur'\\\?', '+AFwAXABc?'),
528 (ur'++--', '+-+---')
529 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000530
Walter Dörwald28256f22003-01-19 16:59:20 +0000531 for (x, y) in utfTests:
532 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000533
Walter Dörwald28256f22003-01-19 16:59:20 +0000534 # surrogates not supported
535 self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000536
Walter Dörwald28256f22003-01-19 16:59:20 +0000537 self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000538
def test_codecs_utf8(self):
    # Encoding: each pair is (unicode text, expected UTF-8 bytes).
    long_text = (
        u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
        u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
        u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
        u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
        u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
        u' Nunstuck git und'
    )
    long_bytes = (
        '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
        '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
        '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
        '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
        '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
        '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
        '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
        '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
        '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
        '\xe3\x80\x8cWenn ist das Nunstuck git und'
    )
    encode_cases = (
        (u'', ''),
        (u'\u20ac', '\xe2\x82\xac'),
        (u'\ud800\udc02', '\xf0\x90\x80\x82'),
        (u'\ud84d\udc56', '\xf0\xa3\x91\x96'),
        (u'\ud800', '\xed\xa0\x80'),
        (u'\udc00', '\xed\xb0\x80'),
        (u'\ud800\udc02'*1000, '\xf0\x90\x80\x82'*1000),
        (long_text, long_bytes),
    )
    for text, encoded in encode_cases:
        self.assertEqual(text.encode('utf-8'), encoded)

    # UTF-8 specific decoding tests
    decode_cases = (
        ('\xf0\xa3\x91\x96', u'\U00023456'),
        ('\xf0\x90\x80\x82', u'\U00010002'),
        ('\xe2\x82\xac', u'\u20ac'),
    )
    for encoded, text in decode_cases:
        self.assertEqual(unicode(encoded, 'utf-8'), text)

    # Other possible utf-8 test cases:
    # * strict decoding testing for all of the
    #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000577
def test_codecs_idna(self):
    # Test whether trailing dot is preserved
    encoded = u"www.python.org.".encode("idna")
    self.assertEqual(encoded, "www.python.org.")
581
def test_codecs_errors(self):
    # Error handling (encoding)
    text = u'Andr\202 x'
    self.assertRaises(UnicodeError, text.encode, 'ascii')
    self.assertRaises(UnicodeError, text.encode, 'ascii','strict')
    self.assertEqual(text.encode('ascii','ignore'), "Andr x")
    self.assertEqual(text.encode('ascii','replace'), "Andr? x")

    # Error handling (decoding)
    raw = 'Andr\202 x'
    self.assertRaises(UnicodeError, unicode, raw, 'ascii')
    self.assertRaises(UnicodeError, unicode, raw, 'ascii','strict')
    self.assertEqual(unicode(raw,'ascii','ignore'), u"Andr x")
    self.assertEqual(unicode(raw,'ascii','replace'), u'Andr\uFFFD x')

    # Error handling (unknown character names)
    self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx")

    # Error handling (truncated escape sequence)
    self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")

    # The test.unicode1 / test.unicode2 codecs return malformed results.
    self.assertRaises(TypeError, "hello".decode, "test.unicode1")
    self.assertRaises(TypeError, unicode, "hello", "test.unicode2")
    for broken_codec in ("test.unicode1", "test.unicode2"):
        self.assertRaises(TypeError, u"hello".encode, broken_codec)
    # executes PyUnicode_Encode()
    import imp
    self.assertRaises(
        ImportError,
        imp.find_module,
        "non-existing module",
        [u"non-existing dir"]
    )

    # Error handling (wrong arguments)
    self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)

    # Error handling (PyUnicode_EncodeDecimal())
    self.assertRaises(UnicodeError, int, u"\u0200")
Guido van Rossum97064862000-04-10 13:52:48 +0000619
def test_codecs(self):
    # Checks the encoded byte form of plain ASCII text in several codecs,
    # then checks that encoding followed by decoding with the same codec
    # gives back the original text for several code point ranges.

    def check_roundtrip(text, codec):
        # Encode, decode again with the same codec, compare with the input.
        self.assertEqual(unicode(text.encode(codec), codec), text)

    # Encoding
    hello_encodings = (
        ('ascii', 'hello'),
        ('utf-7', 'hello'),
        ('utf-8', 'hello'),
        ('utf8', 'hello'),
        ('utf-16-le', 'h\000e\000l\000l\000o\000'),
        ('utf-16-be', '\000h\000e\000l\000l\000o'),
        ('latin-1', 'hello'),
    )
    for codec, encoded in hello_encodings:
        self.assertEqual(u'hello'.encode(codec), encoded)

    # Roundtrip safety for BMP (just the first 1024 chars)
    unicode_codecs = (
        'utf-7', 'utf-8', 'utf-16', 'utf-16-le',
        'utf-16-be', 'raw_unicode_escape',
        'unicode_escape', 'unicode_internal',
    )
    for codepoint in xrange(1024):
        char = unichr(codepoint)
        for codec in unicode_codecs:
            check_roundtrip(char, codec)

    # Roundtrip safety for BMP (just the first 256 chars)
    for codepoint in xrange(256):
        check_roundtrip(unichr(codepoint), 'latin-1')

    # Roundtrip safety for BMP (just the first 128 chars)
    for codepoint in xrange(128):
        check_roundtrip(unichr(codepoint), 'ascii')

    # Roundtrip safety for non-BMP (just a few chars)
    # NOTE: 'raw_unicode_escape' is intentionally not part of this list;
    # the reason is not recorded here.
    astral = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
    for codec in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                  'unicode_escape', 'unicode_internal'):
        check_roundtrip(astral, codec)

    # UTF-8 must be roundtrip safe for all UCS-2 code points
    # This excludes surrogates: in the full range, there would be
    # a surrogate pair (\udbff\udc00), which gets converted back
    # to a non-BMP character (\U0010fc00)
    ucs2_points = range(0, 0xd800) + range(0xe000, 0x10000)
    ucs2_text = u''.join([unichr(point) for point in ucs2_points])
    check_roundtrip(ucs2_text, 'utf-8')
Guido van Rossum9e896b32000-04-05 20:11:21 +0000664
def test_codecs_charmap(self):
    # Decodes a byte string with each listed charmap codec, encodes the
    # result with the same codec, and expects the original bytes back.

    def check_codecs(byte_values, codecs_to_check):
        raw = ''.join([chr(value) for value in byte_values])
        for codec in codecs_to_check:
            self.assertEqual(unicode(raw, codec).encode(codec), raw)

    # Codecs checked over bytes 0-127.
    # Left out on purpose:
    #   undefined mappings:   'cp424'
    #   fails the round-trip: 'cp875'
    low_half_codecs = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
        'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        'cp1256', 'cp1257', 'cp1258',
        'cp856', 'cp857', 'cp864', 'cp869', 'cp874',

        'mac_greek', 'mac_iceland', 'mac_roman', 'mac_turkish',
        'cp1006', 'iso8859_8',
    )

    # Codecs checked over bytes 128-255.
    # Left out on purpose:
    #   undefined mappings:
    #     'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
    #     'cp1256', 'cp1257', 'cp1258',
    #     'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
    #     'iso8859_3', 'iso8859_6', 'iso8859_7',
    #     'mac_greek', 'mac_iceland', 'mac_roman', 'mac_turkish'
    #   fails the round-trip:
    #     'cp1006', 'cp875', 'iso8859_8'
    high_half_codecs = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_4', 'iso8859_5',
        'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',
    )

    # 0-127
    check_codecs(xrange(128), low_half_codecs)

    # 128-255
    check_codecs(xrange(128, 256), high_half_codecs)
Guido van Rossum9e896b32000-04-05 20:11:21 +0000718
Walter Dörwald28256f22003-01-19 16:59:20 +0000719 def test_concatenation(self):
720 self.assertEqual((u"abc" u"def"), u"abcdef")
721 self.assertEqual(("abc" u"def"), u"abcdef")
722 self.assertEqual((u"abc" "def"), u"abcdef")
723 self.assertEqual((u"abc" u"def" "ghi"), u"abcdefghi")
724 self.assertEqual(("abc" "def" u"ghi"), u"abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +0000725
def test_printing(self):
    # Runs the "print >>file" statement with unicode and mixed arguments.
    # The output is thrown away; the test passes if nothing raises.
    class NullWriter:
        # Minimal file-like object: accepts any text and discards it.
        def write(self, text):
            pass

    sink = NullWriter()
    print >>sink, u'abc'
    print >>sink, u'abc', u'def'
    print >>sink, u'abc', 'def'
    print >>sink, 'abc', u'def'
    print >>sink, u'abc\n'
    # NOTE(review): the repeated statements below appear deliberate (a
    # trailing comma changes how the following print starts) -- confirm
    # before deduplicating.
    print >>sink, u'abc\n',
    print >>sink, u'abc\n',
    print >>sink, u'def\n'
    print >>sink, u'def\n'
Fred Drake004d5e62000-10-23 17:22:08 +0000741
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +0000742 def test_ucs4(self):
743 if sys.maxunicode == 0xFFFF:
744 return
745 x = u'\U00100000'
746 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
747 self.assertEqual(x, y)
748
def test_conversion(self):
    # Make sure __unicode__() works properly: unicode() and str() are
    # applied to objects that define __str__ and/or __unicode__ in
    # various combinations and the results compared with fixed values.

    # Old-style class, only __str__.
    class ClassicStr:
        def __str__(self):
            return "foo"

    # Old-style class, __unicode__ returning unicode.
    class ClassicUnicode:
        def __unicode__(self):
            return u"foo"

    # New-style class, __unicode__ returning unicode.
    class ObjectUnicode(object):
        def __unicode__(self):
            return u"foo"

    # New-style class, __unicode__ returning a plain str.
    class ObjectStrResult(object):
        def __unicode__(self):
            return "foo"

    # str subclass, __unicode__ returning a plain str.
    class StrSubStrResult(str):
        def __unicode__(self):
            return "foo"

    # unicode subclass, __unicode__ returning a plain str.
    class UnicodeSubStrResult(unicode):
        def __unicode__(self):
            return "foo"

    # str subclass with both __str__ and __unicode__.
    class StrSubBoth(str):
        def __str__(self):
            return "foos"

        def __unicode__(self):
            return u"foou"

    # unicode subclass with both __str__ and __unicode__.
    class UnicodeSubBoth(unicode):
        def __str__(self):
            return "foos"

        def __unicode__(self):
            return u"foou"

    # unicode subclass whose constructor doubles its content and whose
    # __unicode__ returns the instance itself.
    class DoubledUnicode(unicode):
        def __new__(cls, content=""):
            return unicode.__new__(cls, 2*content)

        def __unicode__(self):
            return self

    # unicode subclass whose __str__ and __unicode__ return different text.
    class SplitPersonality(unicode):
        def __str__(self):
            return "string"

        def __unicode__(self):
            return "not unicode"

    # (converter, class, constructor arguments, expected result); each
    # instance is created right before its own comparison.
    cases = (
        (unicode, ClassicStr, (), u"foo"),
        (unicode, ClassicUnicode, (), u"foo"),
        (unicode, ObjectUnicode, (), u"foo"),
        (unicode, ObjectStrResult, (), u"foo"),
        (unicode, StrSubStrResult, ("bar",), u"foo"),
        (unicode, UnicodeSubStrResult, ("bar",), u"foo"),
        (unicode, StrSubBoth, ("bar",), u"foou"),
        (unicode, UnicodeSubBoth, ("bar",), u"foou"),
        (unicode, DoubledUnicode, ("foo",), u"foofoo"),
        (str, SplitPersonality, ("foo",), "string"),
        (unicode, SplitPersonality, ("foo",), u"not unicode"),
    )
    for convert, cls, ctor_args, expected in cases:
        self.assertEqual(convert(cls(*ctor_args)), expected)
811
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000812 def test_unicode_repr(self):
813 class s1:
814 def __repr__(self):
815 return '\\n'
816
817 class s2:
818 def __repr__(self):
819 return u'\\n'
820
821 self.assertEqual(repr(s1()), '\\n')
822 self.assertEqual(repr(s2()), '\\n')
823
824
825
826
827
def test_main():
    # Entry point: hands the test case class(es) of this module to the
    # test_support runner.
    test_classes = (UnicodeTest,)
    test_support.run_unittest(*test_classes)
Barry Warsaw817918c2002-08-06 16:58:21 +0000830
# Run the tests when this file is executed directly as a script.
if "__main__" == __name__:
    test_main()