blob: bb3338b450ea0d6795ac70adc2c98c4c5ed4cf34 [file] [log] [blame]
Martin v. Löwisa729daf2002-08-04 17:28:33 +00001# -*- coding: iso-8859-1 -*-
Guido van Rossuma831cac2000-03-10 23:23:21 +00002""" Test script for the Unicode implementation.
3
Guido van Rossuma831cac2000-03-10 23:23:21 +00004Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
Marc-André Lemburg36619082001-01-17 19:11:13 +00008"""#"
Walter Dörwald0fd583c2003-02-21 12:53:50 +00009import unittest, sys, string, codecs, new
10from test import test_support, string_tests
Guido van Rossuma831cac2000-03-10 23:23:21 +000011
Neal Norwitz430f68b2005-11-24 22:00:56 +000012# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook that serves two deliberately broken test codecs.

    "test.unicode1" maps to an encoder/decoder pair that returns a bare
    integer instead of a tuple; "test.unicode2" maps to a pair that returns
    a tuple whose first item is not a string.  Any other encoding name
    yields None so that the lookup continues with other search functions.
    """
    def encode1(input, errors="strict"):
        return 42  # not a tuple

    def decode1(input, errors="strict"):
        return 42  # not a tuple

    def encode2(input, errors="strict"):
        return (42, 42)  # no unicode

    def decode2(input, errors="strict"):
        return (42, 42)  # no unicode

    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    if encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    return None


codecs.register(search_function)
29
Walter Dörwald0fd583c2003-02-21 12:53:50 +000030class UnicodeTest(
31 string_tests.CommonTest,
Walter Dörwald57d88e52004-08-26 16:53:04 +000032 string_tests.MixinStrUnicodeUserStringTest,
33 string_tests.MixinStrUnicodeTest,
Walter Dörwald0fd583c2003-02-21 12:53:50 +000034 ):
35 type2test = unicode
36
def checkequalnofix(self, result, object, methodname, *args):
    """Call object.methodname(*args) and check the value and its exact type.

    The receiver is used as given.  When the method hands back the receiver
    itself, the call is repeated on a unicode subclass instance to make sure
    a subclass instance is never returned unchanged.
    """
    actual = getattr(object, methodname)(*args)
    self.assertEqual(actual, result)
    self.assert_(type(actual) is type(result))

    if actual is not object:
        return

    # The original was returned: this must not happen with subclasses.
    class usub(unicode):
        def __repr__(self):
            return 'usub(%r)' % unicode.__repr__(self)

    wrapped = usub(object)
    actual = getattr(wrapped, methodname)(*args)
    self.assertEqual(actual, result)
    self.assert_(wrapped is not actual)
Guido van Rossume4874ae2001-09-21 15:36:41 +000054
def test_literals(self):
    # Different escape spellings of the same code point compare equal.
    self.assertEqual(u'\xff', u'\u00ff')
    self.assertEqual(u'\uffff', u'\U0000ffff')
    # Evaluating a literal whose escape lies beyond U+10FFFF must fail.
    out_of_range_literals = (
        'u\'\\Ufffffffe\'',
        'u\'\\Uffffffff\'',
        'u\'\\U%08x\'' % 0x110000,
    )
    for literal_source in out_of_range_literals:
        self.assertRaises(UnicodeError, eval, literal_source)
61
def test_repr(self):
    # Nothing is checked when running on Jython.
    if sys.platform.startswith('java'):
        return

    # Test basic sanity of repr(): (value, expected repr) pairs.
    simple_cases = (
        (u'abc', "u'abc'"),
        (u'ab\\c', "u'ab\\\\c'"),
        (u'ab\\', "u'ab\\\\'"),
        (u'\\c', "u'\\\\c'"),
        (u'\\', "u'\\\\'"),
        (u'\n', "u'\\n'"),
        (u'\r', "u'\\r'"),
        (u'\t', "u'\\t'"),
        (u'\b', "u'\\x08'"),
        (u"'\"", """u'\\'"'"""),
        (u"'\"", """u'\\'"'"""),
        (u"'", '''u"'"'''),
        (u'"', """u'"'"""),
    )
    for value, expected in simple_cases:
        self.assertEqual(repr(value), expected)

    # Expected repr of the string made of code points 0 through 255.
    latin1repr = (
        "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
        "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
        "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
        "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
        "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
        "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
        "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
        "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
        "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
        "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
        "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
        "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
        "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
        "\\xfe\\xff'")
    first_256_chars = u''.join(map(unichr, xrange(256)))
    self.assertEqual(repr(first_256_chars), latin1repr)

    # Test repr works on wide unicode escapes without overflow.
    wide_text = u"\U00010000" * 39 + u"\uffff" * 4096
    self.assertEqual(repr(wide_text), repr(wide_text))
Walter Dörwald28256f22003-01-19 16:59:20 +000098
def test_iterators(self):
    # Unicode objects must provide __iter__, yielding one character per
    # step and raising StopIteration once exhausted.
    char_iter = u"\u1111\u2222\u3333".__iter__()
    for expected_char in (u"\u1111", u"\u2222", u"\u3333"):
        self.assertEqual(char_iter.next(), expected_char)
    self.assertRaises(StopIteration, char_iter.next)
105 self.assertRaises(StopIteration, it.next)
106
def test_count(self):
    string_tests.CommonTest.test_count(self)
    # Mixed argument types: (expected count, receiver, count() arguments)
    mixed_cases = (
        (3, 'aaa', (u'a',)),
        (0, 'aaa', (u'b',)),
        (3, u'aaa', ('a',)),
        (0, u'aaa', ('b',)),
        (0, u'aaa', ('b',)),
        (1, u'aaa', ('a', -1)),
        (3, u'aaa', ('a', -10)),
        (2, u'aaa', ('a', 0, -1)),
        (0, u'aaa', ('a', 0, -10)),
    )
    for expected, receiver, call_args in mixed_cases:
        self.checkequalnofix(expected, receiver, 'count', *call_args)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000119
def test_find(self):
    # (expected index, find() arguments) against a fixed haystack.
    haystack = u'abcdefghiabc'
    for expected, call_args in ((0, (u'abc',)),
                                (9, (u'abc', 1)),
                                (-1, (u'def', 4))):
        self.checkequalnofix(expected, haystack, 'find', *call_args)

    # A missing substring or a non-string substring is a TypeError.
    for bad_args in ((), (42,)):
        self.assertRaises(TypeError, u'hello'.find, *bad_args)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000127
def test_rfind(self):
    string_tests.CommonTest.test_rfind(self)
    # Mixed argument types: (expected index, receiver, substring)
    mixed_cases = (
        (9, 'abcdefghiabc', u'abc'),
        (12, 'abcdefghiabc', u''),
        (12, u'abcdefghiabc', ''),
    )
    for expected, receiver, needle in mixed_cases:
        self.checkequalnofix(expected, receiver, 'rfind', needle)
Guido van Rossum8b264542000-12-19 02:22:31 +0000134
def test_index(self):
    string_tests.CommonTest.test_index(self)
    # Mixed argument types: every check runs once with a str receiver and
    # a unicode argument, then once the other way round.
    found = (
        # (expected index, substring, extra index() arguments)
        (0, '', ()),
        (3, 'def', ()),
        (0, 'abc', ()),
        (9, 'abc', (1,)),
    )
    not_found = (
        # (receiver text, substring, extra index() arguments)
        ('abcdefghiabc', 'hib', ()),
        ('abcdefghiab', 'abc', (1,)),
        ('abcdefghi', 'ghi', (8,)),
        ('abcdefghi', 'ghi', (-1,)),
    )
    for receiver_type, arg_type in ((str, unicode), (unicode, str)):
        for expected, needle, extra in found:
            self.checkequalnofix(expected, receiver_type('abcdefghiabc'),
                                 'index', arg_type(needle), *extra)
        for text, needle, extra in not_found:
            self.assertRaises(ValueError, receiver_type(text).index,
                              arg_type(needle), *extra)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000147
def test_rindex(self):
    string_tests.CommonTest.test_rindex(self)
    # Mixed argument types: every check runs once with a str receiver and
    # a unicode argument, then once the other way round.
    found = (
        # (expected index, substring, extra rindex() arguments)
        (12, '', ()),
        (3, 'def', ()),
        (9, 'abc', ()),
        (0, 'abc', (0, -1)),
    )
    not_found = (
        # (receiver text, substring, extra rindex() arguments)
        ('abcdefghiabc', 'hib', ()),
        ('defghiabc', 'def', (1,)),
        ('defghiabc', 'abc', (0, -1)),
        ('abcdefghi', 'ghi', (0, 8)),
        ('abcdefghi', 'ghi', (0, -1)),
    )
    for receiver_type, arg_type in ((str, unicode), (unicode, str)):
        for expected, needle, extra in found:
            self.checkequalnofix(expected, receiver_type('abcdefghiabc'),
                                 'rindex', arg_type(needle), *extra)

        for text, needle, extra in not_found:
            self.assertRaises(ValueError, receiver_type(text).rindex,
                              arg_type(needle), *extra)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000162
def test_translate(self):
    # (expected, receiver, mapping).  A mapping value of None deletes the
    # character; an ordinal or a unicode string replaces it.
    cases = (
        (u'bbbc', u'abababc', {ord('a'):None}),
        (u'iiic', u'abababc', {ord('a'):None, ord('b'):ord('i')}),
        (u'iiix', u'abababc', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'}),
        (u'<i><i><i>c', u'abababc', {ord('a'):None, ord('b'):u'<i>'}),
        (u'c', u'abababc', {ord('a'):None, ord('b'):u''}),
        (u'xyyx', u'xzx', {ord('z'):u'yy'}),
    )
    for expected, receiver, mapping in cases:
        self.checkequalnofix(expected, receiver, 'translate', mapping)

    # translate() needs a mapping, and a byte-string mapping value is
    # rejected.
    self.assertRaises(TypeError, u'hello'.translate)
    self.assertRaises(TypeError, u'abababc'.translate, {ord('a'):''})
Guido van Rossuma831cac2000-03-10 23:23:21 +0000173
def test_split(self):
    string_tests.CommonTest.test_split(self)

    # Mixed arguments: whichever side is unicode, the pieces are unicode.
    four_parts = [u'a', u'b', u'c', u'd']
    self.checkequalnofix(four_parts, u'a//b//c//d', 'split', '//')
    self.checkequalnofix(four_parts, 'a//b//c//d', 'split', u'//')
    # A separator at the very end leaves a trailing empty piece.
    self.checkequalnofix([u'endcase ', u''], u'endcase test', 'split', 'test')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000181
def test_join(self):
    string_tests.MixinStrUnicodeUserStringTest.test_join(self)

    # Mixed arguments: (expected, separator, items to join)
    mixed_cases = (
        (u'a b c d', u' ', ['a', 'b', u'c', u'd']),
        (u'abcd', u'', (u'a', u'b', u'c', u'd')),
        (u'w x y z', u' ', string_tests.Sequence('wxyz')),
        (u'a b c d', ' ', [u'a', u'b', u'c', u'd']),
        (u'a b c d', ' ', ['a', 'b', u'c', u'd']),
        (u'abcd', '', (u'a', u'b', u'c', u'd')),
        (u'w x y z', ' ', string_tests.Sequence(u'wxyz')),
    )
    for expected, separator, items in mixed_cases:
        self.checkequalnofix(expected, separator, 'join', items)
Marc-André Lemburge5034372000-08-08 08:04:29 +0000193
def test_strip(self):
    string_tests.CommonTest.test_strip(self)
    # A byte string holding a non-ASCII byte is not accepted as the set
    # of characters to strip from a unicode string.
    strip_hello = u"hello".strip
    self.assertRaises(UnicodeError, strip_hello, "\xff")
Guido van Rossuma831cac2000-03-10 23:23:21 +0000197
def test_replace(self):
    string_tests.CommonTest.test_replace(self)

    # method call forwarded from str implementation because of unicode argument
    byte_text = 'one!two!three!'
    self.checkequalnofix(u'one@two!three!', byte_text, 'replace', u'!', u'@', 1)
    # The replacement must be string-like as well.
    replace_method = 'replace'.replace
    self.assertRaises(TypeError, replace_method, u"r", 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000204
def test_comparison(self):
    # Comparisons: equality and ordering across every str/unicode pairing.
    for left, right in ((u'abc', 'abc'), ('abc', u'abc'), (u'abc', u'abc')):
        self.assertEqual(left, right)
    for longer, shorter in ((u'abcd', 'abc'), ('abcd', u'abc'), (u'abcd', u'abc')):
        self.assert_(longer > shorter)
    for shorter, longer in ((u'abc', 'abcd'), ('abc', u'abcd'), (u'abc', u'abcd')):
        self.assert_(shorter < longer)

    if 0:
        # Move these tests to a Unicode collation module test...
        # Testing UTF-16 code point order comparisons...

        # No surrogates, no fixup required.
        self.assert_(u'\u0061' < u'\u20ac')
        # Non surrogate below surrogate value, no fixup required
        self.assert_(u'\u0061' < u'\ud800\udc02')

        # Non surrogate above surrogate value, fixup required
        def test_lecmp(s, s2):
            self.assert_(s < s2)

        def test_fixup(s):
            surrogate_pairs = (
                u'\ud800\udc01', u'\ud900\udc01', u'\uda00\udc01', u'\udb00\udc01',
                u'\ud800\udd01', u'\ud900\udd01', u'\uda00\udd01', u'\udb00\udd01',
                u'\ud800\ude01', u'\ud900\ude01', u'\uda00\ude01', u'\udb00\ude01',
                u'\ud800\udfff', u'\ud900\udfff', u'\uda00\udfff', u'\udb00\udfff',
            )
            for s2 in surrogate_pairs:
                test_lecmp(s, s2)

        test_fixup(u'\ue000')
        test_fixup(u'\uff61')

        # Surrogates on both sides, no fixup required
        self.assert_(u'\ud800\udc02' < u'\ud84d\udc56')
269
def test_islower(self):
    string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
    # U+1FFC is a titlecase letter, so it does not count as lowercase.
    titlecase_letter = u'\u1FFc'
    self.checkequalnofix(False, titlecase_letter, 'islower')
Walter Dörwald28256f22003-01-19 16:59:20 +0000273
def test_isupper(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
    # U+1FFC is a titlecase letter, so it does not count as uppercase.
    # This extra check is not made on Jython.
    running_on_jython = sys.platform.startswith('java')
    if not running_on_jython:
        self.checkequalnofix(False, u'\u1FFc', 'isupper')
Walter Dörwald28256f22003-01-19 16:59:20 +0000278
def test_istitle(self):
    # Run the shared istitle() checks first.  This override shadows the
    # inherited test_istitle, so it has to be invoked explicitly; the
    # shared test_title still runs on its own because it is not overridden.
    string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
    # U+1FFC is a titlecase letter, alone or at the start of a word.
    self.checkequalnofix(True, u'\u1FFc', 'istitle')
    self.checkequalnofix(True, u'Greek \u1FFcitlecases ...', 'istitle')
Walter Dörwald28256f22003-01-19 16:59:20 +0000283
def test_isspace(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
    # (expected, character): U+2000 EN QUAD and U+200A HAIR SPACE are
    # whitespace, U+2014 EM DASH is not.
    for expected, char in ((True, u'\u2000'),
                           (True, u'\u200a'),
                           (False, u'\u2014')):
        self.checkequalnofix(expected, char, 'isspace')
Walter Dörwald28256f22003-01-19 16:59:20 +0000289
def test_isalpha(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
    # A titlecase letter (U+1FFC) is alphabetic.
    titlecase_letter = u'\u1FFc'
    self.checkequalnofix(True, titlecase_letter, 'isalpha')
Walter Dörwald28256f22003-01-19 16:59:20 +0000293
def test_isdecimal(self):
    # (expected, text)
    cases = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (False, u'\u2460'),  # CIRCLED DIGIT ONE
        (False, u'\xbc'),  # VULGAR FRACTION ONE QUARTER
        (True, u'\u0660'),  # ARABIC-INDIC DIGIT ZERO
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, text in cases:
        self.checkequalnofix(expected, text, 'isdecimal')

    # isdecimal() takes no arguments.
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)
Walter Dörwald28256f22003-01-19 16:59:20 +0000305
def test_isdigit(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
    # (expected, character): U+2460 and U+0660 are digits, U+00BC is not.
    for expected, char in ((True, u'\u2460'),
                           (False, u'\xbc'),
                           (True, u'\u0660')):
        self.checkequalnofix(expected, char, 'isdigit')
Walter Dörwald28256f22003-01-19 16:59:20 +0000311
def test_isnumeric(self):
    # (expected, text)
    cases = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (True, u'\u2460'),
        (True, u'\xbc'),
        (True, u'\u0660'),
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, text in cases:
        self.checkequalnofix(expected, text, 'isnumeric')

    # isnumeric() takes no arguments.
    self.assertRaises(TypeError, u"abc".isnumeric, 42)
323
def test_contains(self):
    # Testing Unicode contains method
    # Each entry is (item, container, expected result of "item in container").
    first_batch = (
        ('a', u'abdb', True),
        ('a', u'bdab', True),
        ('a', u'bdaba', True),
        ('a', u'bdba', True),
        ('a', u'bdba', True),
        (u'a', u'bdba', True),
        (u'a', u'bdb', False),
        (u'a', 'bdb', False),
        (u'a', 'bdba', True),
        (u'a', ('a',1,None), True),
        (u'a', (1,None,'a'), True),
        (u'a', (1,None,u'a'), True),
        ('a', ('a',1,None), True),
        ('a', (1,None,'a'), True),
        ('a', (1,None,u'a'), True),
        ('a', ('x',1,u'y'), False),
        ('a', ('x',1,None), False),
        (u'abcd', u'abcxxxx', False),
        (u'ab', u'abcd', True),
        ('ab', u'abc', True),
        (u'ab', 'abc', True),
        (u'ab', (1,None,u'ab'), True),
        (u'', u'abc', True),
        ('', u'abc', True),
    )
    for item, container, expected in first_batch:
        self.assert_((item in container) == expected)

    # If the following fails either
    # the contains operator does not propagate UnicodeErrors or
    # someone has changed the default encoding
    self.assertRaises(UnicodeError, 'g\xe2teau'.__contains__, u'\xe2')

    second_batch = (
        (u'', '', True),
        ('', u'', True),
        (u'', u'', True),
        (u'', 'abc', True),
        ('', u'abc', True),
        (u'', u'abc', True),
        (u'\0', 'abc', False),
        ('\0', u'abc', False),
        (u'\0', u'abc', False),
        (u'\0', '\0abc', True),
        ('\0', u'\0abc', True),
        (u'\0', u'\0abc', True),
        (u'\0', 'abc\0', True),
        ('\0', u'abc\0', True),
        (u'\0', u'abc\0', True),
        (u'a', '\0abc', True),
        ('a', u'\0abc', True),
        (u'a', u'\0abc', True),
        (u'asdf', 'asdf', True),
        ('asdf', u'asdf', True),
        (u'asdf', u'asdf', True),
        (u'asdf', 'asd', False),
        ('asdf', u'asd', False),
        (u'asdf', u'asd', False),
        (u'asdf', '', False),
        ('asdf', u'', False),
        (u'asdf', u'', False),
    )
    for item, container, expected in second_batch:
        self.assert_((item in container) == expected)

    # __contains__ needs exactly one argument.
    self.assertRaises(TypeError, u"abc".__contains__)
385
def test_formatting(self):
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
    # %5.2f pads to a minimum width of five characters, so a four
    # character value such as "3.00" gets one leading pad space in
    # addition to the separator space that follows the comma.
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000,  3.57')
    # Seven characters already exceed the minimum width: no padding.
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57')
    if not sys.platform.startswith('java'):
        self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
    self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
    self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')

    self.assertEqual(u'%c' % 0x1234, u'\u1234')
    self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % u"abc", u'...abc...')
    # "*" takes the field width (and precision) from the argument tuple:
    # the result is padded to five characters, on the right when the
    # width is negative, after truncating to the precision if one is given.
    self.assertEqual('%*s' % (5,u'abc',), u'  abc')
    self.assertEqual('%*s' % (-5,u'abc',), u'abc  ')
    self.assertEqual('%*.*s' % (5,2,u'abc',), u'   ab')
    self.assertEqual('%*.*s' % (5,3,u'abc',), u'  abc')
    self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103   abc')
    self.assertEqual('%c' % u'a', u'a')

    # A __str__ that returns unicode turns the whole result into unicode.
    class Wrapper:
        def __str__(self):
            return u'\u1234'
    self.assertEqual('%s' % Wrapper(), u'\u1234')
Walter Dörwald28256f22003-01-19 16:59:20 +0000424
@test_support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    # should not format with a comma, but always with C locale
    formatted = u'%.1f' % 1.0
    self.assertEqual(u'1.0', formatted)
Georg Brandlda6b1072006-01-20 17:48:54 +0000429
def test_constructor(self):
    # unicode(obj) tests (this maps to PyObject_Unicode() at C level)

    self.assertEqual(
        unicode(u'unicode remains unicode'),
        u'unicode remains unicode'
    )

    class UnicodeSubclass(unicode):
        pass

    self.assertEqual(
        unicode(UnicodeSubclass('unicode subclass becomes unicode')),
        u'unicode subclass becomes unicode'
    )

    self.assertEqual(
        unicode('strings are converted to unicode'),
        u'strings are converted to unicode'
    )

    # Helper whose only conversion hook is __unicode__.
    class UnicodeCompat:
        def __init__(self, x):
            self.x = x
        def __unicode__(self):
            return self.x

    self.assertEqual(
        unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
        u'__unicode__ compatible objects are recognized')

    # Helper whose only conversion hook is __str__.
    class StringCompat:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return self.x

    self.assertEqual(
        unicode(StringCompat('__str__ compatible objects are recognized')),
        u'__str__ compatible objects are recognized'
    )

    # unicode(obj) is compatible to str():

    o = StringCompat('unicode(obj) is compatible to str()')
    self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
    self.assertEqual(str(o), 'unicode(obj) is compatible to str()')

    # %-formatting and .__unicode__()
    self.assertEqual(u'%s' %
                     UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"),
                     u"u'%s' % obj uses obj.__unicode__()")
    # The fallback can only be exercised by an object that has __str__
    # but no __unicode__, hence StringCompat rather than UnicodeCompat.
    self.assertEqual(u'%s' %
                     StringCompat("u'%s' % obj falls back to obj.__str__()"),
                     u"u'%s' % obj falls back to obj.__str__()")

    # Numbers convert the same way through unicode() and str().
    for obj in (123, 123.45):
        self.assertEqual(unicode(obj), unicode(str(obj)))

    # unicode(obj, encoding, error) tests (this maps to
    # PyUnicode_FromEncodedObject() at C level)

    if not sys.platform.startswith('java'):
        self.assertRaises(
            TypeError,
            unicode,
            u'decoding unicode is not supported',
            'utf-8',
            'strict'
        )

    self.assertEqual(
        unicode('strings are decoded to unicode', 'utf-8', 'strict'),
        u'strings are decoded to unicode'
    )

    if not sys.platform.startswith('java'):
        self.assertEqual(
            unicode(
                buffer('character buffers are decoded to unicode'),
                'utf-8',
                'strict'
            ),
            u'character buffers are decoded to unicode'
        )

    # Non-string object, encoding and errors arguments are rejected.
    self.assertRaises(TypeError, unicode, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000517
Walter Dörwald28256f22003-01-19 16:59:20 +0000518 def test_codecs_utf7(self):
519 utfTests = [
520 (u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
521 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
522 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
523 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
524 (u'+', '+-'),
525 (u'+-', '+--'),
526 (u'+?', '+-?'),
527 (u'\?', '+AFw?'),
528 (u'+?', '+-?'),
529 (ur'\\?', '+AFwAXA?'),
530 (ur'\\\?', '+AFwAXABc?'),
531 (ur'++--', '+-+---')
532 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000533
Walter Dörwald28256f22003-01-19 16:59:20 +0000534 for (x, y) in utfTests:
535 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000536
Walter Dörwald28256f22003-01-19 16:59:20 +0000537 # surrogates not supported
538 self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000539
Walter Dörwald28256f22003-01-19 16:59:20 +0000540 self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000541
def test_codecs_utf8(self):
    # Encoding: (unicode text, expected UTF-8 bytes)
    encode_cases = (
        (u'', ''),
        (u'\u20ac', '\xe2\x82\xac'),
        (u'\ud800\udc02', '\xf0\x90\x80\x82'),
        (u'\ud84d\udc56', '\xf0\xa3\x91\x96'),
        (u'\ud800', '\xed\xa0\x80'),
        (u'\udc00', '\xed\xb0\x80'),
        (u'\ud800\udc02'*1000, '\xf0\x90\x80\x82'*1000),
    )
    for text, expected_bytes in encode_cases:
        self.assertEqual(text.encode('utf-8'), expected_bytes)

    # A longer text mixing non-ASCII and ASCII characters.
    mixed_text = (
        u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
        u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
        u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
        u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
        u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
        u' Nunstuck git und'
    )
    mixed_bytes = (
        '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
        '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
        '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
        '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
        '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
        '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
        '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
        '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
        '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
        '\xe3\x80\x8cWenn ist das Nunstuck git und'
    )
    self.assertEqual(mixed_text.encode('utf-8'), mixed_bytes)

    # UTF-8 specific decoding tests: (UTF-8 bytes, expected unicode text)
    decode_cases = (
        ('\xf0\xa3\x91\x96', u'\U00023456'),
        ('\xf0\x90\x80\x82', u'\U00010002'),
        ('\xe2\x82\xac', u'\u20ac'),
    )
    for raw_bytes, expected_text in decode_cases:
        self.assertEqual(unicode(raw_bytes, 'utf-8'), expected_text)

    # Other possible utf-8 test cases:
    # * strict decoding testing for all of the
    #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000580
def test_codecs_idna(self):
    # Test whether trailing dot is preserved
    encoded_name = u"www.python.org.".encode("idna")
    self.assertEqual(encoded_name, "www.python.org.")
584
def test_codecs_errors(self):
    # Error handling (encoding)
    non_ascii_text = u'Andr\202 x'
    self.assertRaises(UnicodeError, non_ascii_text.encode, 'ascii')
    self.assertRaises(UnicodeError, non_ascii_text.encode, 'ascii','strict')
    self.assertEqual(non_ascii_text.encode('ascii','ignore'), "Andr x")
    self.assertEqual(non_ascii_text.encode('ascii','replace'), "Andr? x")

    # Error handling (decoding)
    non_ascii_bytes = 'Andr\202 x'
    self.assertRaises(UnicodeError, unicode, non_ascii_bytes, 'ascii')
    self.assertRaises(UnicodeError, unicode, non_ascii_bytes, 'ascii','strict')
    self.assertEqual(unicode(non_ascii_bytes,'ascii','ignore'), u"Andr x")
    self.assertEqual(unicode(non_ascii_bytes,'ascii','replace'), u'Andr\uFFFD x')

    # Error handling (unknown character names)
    self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx")

    # Error handling (truncated escape sequence)
    self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")

    # Error handling (codecs whose encode/decode functions return
    # something other than the expected tuple)
    self.assertRaises(TypeError, "hello".decode, "test.unicode1")
    self.assertRaises(TypeError, unicode, "hello", "test.unicode2")
    for broken_codec in ("test.unicode1", "test.unicode2"):
        self.assertRaises(TypeError, u"hello".encode, broken_codec)
    # executes PyUnicode_Encode()
    import imp
    self.assertRaises(
        ImportError,
        imp.find_module,
        "non-existing module",
        [u"non-existing dir"]
    )

    # Error handling (wrong arguments)
    self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)

    # Error handling (PyUnicode_EncodeDecimal())
    self.assertRaises(UnicodeError, int, u"\u0200")
Guido van Rossum97064862000-04-10 13:52:48 +0000622
def test_codecs(self):
    """Check basic encoding results and encode/decode round trips.

    First encodes u'hello' with several codecs and compares against the
    expected byte strings, then verifies that decoding an encoded string
    gives back the original text for various code point ranges.
    """
    def check_roundtrip(text, codec_names):
        # Encode and decode *text* with each codec; it must come back unchanged.
        for codec_name in codec_names:
            self.assertEqual(
                unicode(text.encode(codec_name), codec_name), text)

    # Encoding
    expected_by_codec = (
        ('ascii', 'hello'),
        ('utf-7', 'hello'),
        ('utf-8', 'hello'),
        ('utf8', 'hello'),
        ('utf-16-le', 'h\000e\000l\000l\000o\000'),
        ('utf-16-be', '\000h\000e\000l\000l\000o'),
        ('latin-1', 'hello'),
    )
    for codec_name, expected in expected_by_codec:
        self.assertEqual(u'hello'.encode(codec_name), expected)

    # Roundtrip safety for BMP (just the first 1024 chars)
    unicode_codecs = (
        'utf-7', 'utf-8', 'utf-16', 'utf-16-le',
        'utf-16-be', 'raw_unicode_escape',
        'unicode_escape', 'unicode_internal',
    )
    for code_point in xrange(1024):
        check_roundtrip(unichr(code_point), unicode_codecs)

    # Roundtrip safety for BMP (just the first 256 chars)
    for code_point in xrange(256):
        check_roundtrip(unichr(code_point), ('latin-1',))

    # Roundtrip safety for BMP (just the first 128 chars)
    for code_point in xrange(128):
        check_roundtrip(unichr(code_point), ('ascii',))

    # Roundtrip safety for non-BMP (just a few chars);
    # 'raw_unicode_escape' is disabled for this case.
    astral_text = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
    check_roundtrip(
        astral_text,
        ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
         'unicode_escape', 'unicode_internal'))

    # UTF-8 must be roundtrip safe for all UCS-2 code points
    # This excludes surrogates: in the full range, there would be
    # a surrogate pair (\udbff\udc00), which gets converted back
    # to a non-BMP character (\U0010fc00)
    non_surrogates = range(0, 0xd800) + range(0xe000, 0x10000)
    ucs2_text = u''.join([unichr(code_point) for code_point in non_surrogates])
    check_roundtrip(ucs2_text, ('utf-8',))
Guido van Rossum9e896b32000-04-05 20:11:21 +0000667
def test_codecs_charmap(self):
    """Round-trip byte strings through the charmap-based codecs.

    The bytes 0-127 and 128-255 are each decoded and re-encoded with a
    list of codecs; the result must equal the original byte string.
    """
    # Codecs checked against the bytes 0-127.
    # Not listed: 'cp424' (undefined mappings), 'cp875' (fails the
    # round-trip).
    low_half_codecs = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
        'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        'cp1256', 'cp1257', 'cp1258',
        'cp856', 'cp857', 'cp864', 'cp869', 'cp874',

        'mac_greek', 'mac_iceland', 'mac_roman', 'mac_turkish',
        'cp1006', 'iso8859_8',
    )

    # Codecs checked against the bytes 128-255.
    # Not listed because of undefined mappings:
    #   'cp1250' .. 'cp1258', 'cp424', 'cp856', 'cp857', 'cp864',
    #   'cp869', 'cp874', 'iso8859_3', 'iso8859_6', 'iso8859_7',
    #   'mac_greek', 'mac_iceland', 'mac_roman', 'mac_turkish'
    # Not listed because they fail the round-trip:
    #   'cp1006', 'cp875', 'iso8859_8'
    high_half_codecs = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_4', 'iso8859_5',
        'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',
    )

    # 0-127
    low_bytes = ''.join([chr(value) for value in xrange(128)])
    for codec_name in low_half_codecs:
        decoded = unicode(low_bytes, codec_name)
        self.assertEqual(decoded.encode(codec_name), low_bytes)

    # 128-255
    high_bytes = ''.join([chr(value) for value in xrange(128, 256)])
    for codec_name in high_half_codecs:
        decoded = unicode(high_bytes, codec_name)
        self.assertEqual(decoded.encode(codec_name), high_bytes)
Guido van Rossum9e896b32000-04-05 20:11:21 +0000721
Walter Dörwald28256f22003-01-19 16:59:20 +0000722 def test_concatenation(self):
723 self.assertEqual((u"abc" u"def"), u"abcdef")
724 self.assertEqual(("abc" u"def"), u"abcdef")
725 self.assertEqual((u"abc" "def"), u"abcdef")
726 self.assertEqual((u"abc" u"def" "ghi"), u"abcdefghi")
727 self.assertEqual(("abc" "def" u"ghi"), u"abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +0000728
Walter Dörwald28256f22003-01-19 16:59:20 +0000729 def test_printing(self):
730 class BitBucket:
731 def write(self, text):
732 pass
Fred Drake004d5e62000-10-23 17:22:08 +0000733
Walter Dörwald28256f22003-01-19 16:59:20 +0000734 out = BitBucket()
Guido van Rossumbe19ed72007-02-09 05:37:30 +0000735 print(u'abc', file=out)
736 print(u'abc', u'def', file=out)
737 print(u'abc', 'def', file=out)
738 print('abc', u'def', file=out)
739 print(u'abc\n', file=out)
740 print(u'abc\n', end=' ', file=out)
741 print(u'abc\n', end=' ', file=out)
742 print(u'def\n', file=out)
743 print(u'def\n', file=out)
Fred Drake004d5e62000-10-23 17:22:08 +0000744
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +0000745 def test_ucs4(self):
746 if sys.maxunicode == 0xFFFF:
747 return
748 x = u'\U00100000'
749 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
750 self.assertEqual(x, y)
751
def test_conversion(self):
    """Make sure __unicode__() works properly.

    Defines helper classes that combine __str__ and __unicode__ in
    different ways (plain classes, object subclasses, str and unicode
    subclasses) and checks what unicode() / str() return for each.
    """
    class Foo0:
        # Only __str__ is defined.
        def __str__(self):
            return "foo"

    class Foo1:
        # __unicode__ returning a unicode string.
        def __unicode__(self):
            return u"foo"

    class Foo2(object):
        # Same as Foo1, but deriving from object.
        def __unicode__(self):
            return u"foo"

    class Foo3(object):
        # __unicode__ returning a plain string.
        def __unicode__(self):
            return "foo"

    class Foo4(str):
        def __unicode__(self):
            return "foo"

    class Foo5(unicode):
        def __unicode__(self):
            return "foo"

    class Foo6(str):
        # Both conversions defined, with distinguishable results.
        def __str__(self):
            return "foos"

        def __unicode__(self):
            return u"foou"

    class Foo7(unicode):
        def __str__(self):
            return "foos"

        def __unicode__(self):
            return u"foou"

    class Foo8(unicode):
        # The constructor doubles its content; __unicode__ returns self.
        def __new__(cls, content=""):
            return unicode.__new__(cls, 2*content)

        def __unicode__(self):
            return self

    class Foo9(unicode):
        def __str__(self):
            return "string"

        def __unicode__(self):
            return "not unicode"

    # (converter, class, constructor arguments, expected result);
    # instances are created one at a time inside the loop.
    cases = (
        (unicode, Foo0, (), u"foo"),
        (unicode, Foo1, (), u"foo"),
        (unicode, Foo2, (), u"foo"),
        (unicode, Foo3, (), u"foo"),
        (unicode, Foo4, ("bar",), u"foo"),
        (unicode, Foo5, ("bar",), u"foo"),
        (unicode, Foo6, ("bar",), u"foou"),
        (unicode, Foo7, ("bar",), u"foou"),
        (unicode, Foo8, ("foo",), u"foofoo"),
        (str, Foo9, ("foo",), "string"),
        (unicode, Foo9, ("foo",), u"not unicode"),
    )
    for convert, klass, ctor_args, expected in cases:
        self.assertEqual(convert(klass(*ctor_args)), expected)
814
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000815 def test_unicode_repr(self):
816 class s1:
817 def __repr__(self):
818 return '\\n'
819
820 class s2:
821 def __repr__(self):
822 return u'\\n'
823
824 self.assertEqual(repr(s1()), '\\n')
825 self.assertEqual(repr(s2()), '\\n')
826
827
828
829
830
def test_main():
    """Run the UnicodeTest test case via test_support.run_unittest()."""
    run_tests = test_support.run_unittest
    run_tests(UnicodeTest)
Barry Warsaw817918c2002-08-06 16:58:21 +0000833
# Run the tests when this file is executed as a script.
if __name__ == "__main__":
    test_main()