blob: 85c358669ee3a45bb53046b21d2ca9be6a1b46fd [file] [log] [blame]
Martin v. Löwisa729daf2002-08-04 17:28:33 +00001# -*- coding: iso-8859-1 -*-
Guido van Rossuma831cac2000-03-10 23:23:21 +00002""" Test script for the Unicode implementation.
3
Guido van Rossuma831cac2000-03-10 23:23:21 +00004Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
Marc-André Lemburg36619082001-01-17 19:11:13 +00008"""#"
Walter Dörwald28256f22003-01-19 16:59:20 +00009import unittest, test.test_support
10import sys, string, codecs
Guido van Rossuma831cac2000-03-10 23:23:21 +000011
Walter Dörwald28256f22003-01-19 16:59:20 +000012class UnicodeTest(unittest.TestCase):
Guido van Rossume4874ae2001-09-21 15:36:41 +000013
Walter Dörwald28256f22003-01-19 16:59:20 +000014 def test_repr(self):
15 if not sys.platform.startswith('java'):
16 # Test basic sanity of repr()
17 self.assertEqual(repr(u'abc'), "u'abc'")
18 self.assertEqual(repr(u'ab\\c'), "u'ab\\\\c'")
19 self.assertEqual(repr(u'ab\\'), "u'ab\\\\'")
20 self.assertEqual(repr(u'\\c'), "u'\\\\c'")
21 self.assertEqual(repr(u'\\'), "u'\\\\'")
22 self.assertEqual(repr(u'\n'), "u'\\n'")
23 self.assertEqual(repr(u'\r'), "u'\\r'")
24 self.assertEqual(repr(u'\t'), "u'\\t'")
25 self.assertEqual(repr(u'\b'), "u'\\x08'")
26 self.assertEqual(repr(u"'\""), """u'\\'"'""")
27 self.assertEqual(repr(u"'\""), """u'\\'"'""")
28 self.assertEqual(repr(u"'"), '''u"'"''')
29 self.assertEqual(repr(u'"'), """u'"'""")
30 latin1repr = (
31 "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
32 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
33 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
34 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
35 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
36 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
37 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
38 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
39 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
40 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
41 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
42 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
43 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
44 "\\xfe\\xff'")
45 testrepr = repr(u''.join(map(unichr, xrange(256))))
46 self.assertEqual(testrepr, latin1repr)
47
48 def checkmethod(self, method, input, output, *args):
Guido van Rossuma831cac2000-03-10 23:23:21 +000049 f = getattr(input, method)
Walter Dörwald28256f22003-01-19 16:59:20 +000050 value = f(*args)
51 self.assertEqual(output, value)
52 self.assert_(type(output) is type(value))
53
Walter Dörwald2ee4be02002-04-17 21:34:05 +000054 # if the original is returned make sure that
55 # this doesn't happen with subclasses
56 if value is input:
57 class usub(unicode):
58 def __repr__(self):
59 return 'usub(%r)' % unicode.__repr__(self)
60 input = usub(input)
Walter Dörwald28256f22003-01-19 16:59:20 +000061 f = getattr(input, method)
62 value = f(*args)
63 self.assertEqual(output, value)
64 self.assert_(input is not value)
Guido van Rossuma831cac2000-03-10 23:23:21 +000065
Walter Dörwald28256f22003-01-19 16:59:20 +000066 def test_capitalize(self):
67 self.checkmethod('capitalize', u' hello ', u' hello ')
68 self.checkmethod('capitalize', u'Hello ', u'Hello ')
69 self.checkmethod('capitalize', u'hello ', u'Hello ')
70 self.checkmethod('capitalize', u'aaaa', u'Aaaa')
71 self.checkmethod('capitalize', u'AaAa', u'Aaaa')
Guido van Rossuma831cac2000-03-10 23:23:21 +000072
Walter Dörwald28256f22003-01-19 16:59:20 +000073 self.assertRaises(TypeError, u'hello'.capitalize, 42)
Marc-André Lemburg3a645e42001-01-16 11:54:12 +000074
Walter Dörwald28256f22003-01-19 16:59:20 +000075 def test_count(self):
76 self.checkmethod('count', u'aaa', 3, u'a')
77 self.checkmethod('count', u'aaa', 0, u'b')
78 self.checkmethod('count', 'aaa', 3, u'a')
79 self.checkmethod('count', 'aaa', 0, u'b')
80 self.checkmethod('count', u'aaa', 3, 'a')
81 self.checkmethod('count', u'aaa', 0, 'b')
Walter Dörwald4f046e22003-02-10 17:51:03 +000082 self.checkmethod('count', u'aaa', 0, 'b')
83 self.checkmethod('count', u'aaa', 1, 'a', -1)
84 self.checkmethod('count', u'aaa', 3, 'a', -10)
85 self.checkmethod('count', u'aaa', 2, 'a', 0, -1)
86 self.checkmethod('count', u'aaa', 0, 'a', 0, -10)
Guido van Rossuma831cac2000-03-10 23:23:21 +000087
Walter Dörwald28256f22003-01-19 16:59:20 +000088 self.assertRaises(TypeError, u'hello'.count)
Walter Dörwald4f046e22003-02-10 17:51:03 +000089 self.assertRaises(TypeError, u'hello'.count, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +000090
Walter Dörwald28256f22003-01-19 16:59:20 +000091 def test_title(self):
92 self.checkmethod('title', u' hello ', u' Hello ')
93 self.checkmethod('title', u'Hello ', u'Hello ')
94 self.checkmethod('title', u'hello ', u'Hello ')
95 self.checkmethod('title', u"fOrMaT thIs aS titLe String", u'Format This As Title String')
96 self.checkmethod('title', u"fOrMaT,thIs-aS*titLe;String", u'Format,This-As*Title;String')
97 self.checkmethod('title', u"getInt", u'Getint')
Guido van Rossuma831cac2000-03-10 23:23:21 +000098
Walter Dörwald74640242003-02-10 17:44:16 +000099 self.assertRaises(TypeError, u'hello'.title, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000100
Walter Dörwald28256f22003-01-19 16:59:20 +0000101 def test_find(self):
102 self.checkmethod('find', u'abcdefghiabc', 0, u'abc')
103 self.checkmethod('find', u'abcdefghiabc', 9, u'abc', 1)
104 self.checkmethod('find', u'abcdefghiabc', -1, u'def', 4)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000105
Walter Dörwald28256f22003-01-19 16:59:20 +0000106 self.assertRaises(TypeError, u'hello'.find)
107 self.assertRaises(TypeError, u'hello'.find, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000108
Walter Dörwald28256f22003-01-19 16:59:20 +0000109 def test_rfind(self):
110 self.checkmethod('rfind', u'abcdefghiabc', 9, u'abc')
111 self.checkmethod('rfind', 'abcdefghiabc', 9, u'abc')
112 self.checkmethod('rfind', 'abcdefghiabc', 12, u'')
113 self.checkmethod('rfind', u'abcdefghiabc', 12, '')
114 self.checkmethod('rfind', u'abcdefghiabc', 12, u'')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000115
Walter Dörwald28256f22003-01-19 16:59:20 +0000116 self.assertRaises(TypeError, u'hello'.rfind)
117 self.assertRaises(TypeError, u'hello'.rfind, 42)
Guido van Rossum8b264542000-12-19 02:22:31 +0000118
Walter Dörwald28256f22003-01-19 16:59:20 +0000119 def test_index(self):
120 self.checkmethod('index', u'abcdefghiabc', 0, u'')
121 self.checkmethod('index', u'abcdefghiabc', 3, u'def')
122 self.checkmethod('index', u'abcdefghiabc', 0, u'abc')
123 self.checkmethod('index', u'abcdefghiabc', 9, u'abc', 1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000124
Walter Dörwald28256f22003-01-19 16:59:20 +0000125 self.assertRaises(ValueError, u'abcdefghiabc'.index, u'hib')
126 self.assertRaises(ValueError, u'abcdefghiab'.index, u'abc', 1)
127 self.assertRaises(ValueError, u'abcdefghi'.index, u'ghi', 8)
128 self.assertRaises(ValueError, u'abcdefghi'.index, u'ghi', -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000129
Walter Dörwald28256f22003-01-19 16:59:20 +0000130 self.assertRaises(TypeError, u'hello'.index)
131 self.assertRaises(TypeError, u'hello'.index, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000132
Walter Dörwald28256f22003-01-19 16:59:20 +0000133 def test_rindex(self):
134 self.checkmethod('rindex', u'abcdefghiabc', 12, u'')
135 self.checkmethod('rindex', u'abcdefghiabc', 3, u'def')
136 self.checkmethod('rindex', u'abcdefghiabc', 9, u'abc')
137 self.checkmethod('rindex', u'abcdefghiabc', 0, u'abc', 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000138
Walter Dörwald28256f22003-01-19 16:59:20 +0000139 self.assertRaises(ValueError, u'abcdefghiabc'.rindex, u'hib')
140 self.assertRaises(ValueError, u'defghiabc'.rindex, u'def', 1)
141 self.assertRaises(ValueError, u'defghiabc'.rindex, u'abc', 0, -1)
142 self.assertRaises(ValueError, u'abcdefghi'.rindex, u'ghi', 0, 8)
143 self.assertRaises(ValueError, u'abcdefghi'.rindex, u'ghi', 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000144
Walter Dörwald28256f22003-01-19 16:59:20 +0000145 self.assertRaises(TypeError, u'hello'.rindex)
146 self.assertRaises(TypeError, u'hello'.rindex, 42)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +0000147
Walter Dörwald28256f22003-01-19 16:59:20 +0000148 def test_lower(self):
149 self.checkmethod('lower', u'HeLLo', u'hello')
150 self.checkmethod('lower', u'hello', u'hello')
Walter Dörwaldde02bcb2002-04-22 17:42:37 +0000151
Walter Dörwald28256f22003-01-19 16:59:20 +0000152 self.assertRaises(TypeError, u"hello".lower, 42)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +0000153
Walter Dörwald28256f22003-01-19 16:59:20 +0000154 def test_upper(self):
155 self.checkmethod('upper', u'HeLLo', u'HELLO')
156 self.checkmethod('upper', u'HELLO', u'HELLO')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000157
Walter Dörwald28256f22003-01-19 16:59:20 +0000158 self.assertRaises(TypeError, u'hello'.upper, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000159
Walter Dörwald28256f22003-01-19 16:59:20 +0000160 def test_translate(self):
161 if 0:
162 transtable = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`xyzdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377'
Guido van Rossuma831cac2000-03-10 23:23:21 +0000163
Walter Dörwald28256f22003-01-19 16:59:20 +0000164 self.checkmethod('maketrans', u'abc', transtable, u'xyz')
165 self.checkmethod('maketrans', u'abc', ValueError, u'xyzq')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000166
Walter Dörwald28256f22003-01-19 16:59:20 +0000167 self.checkmethod('translate', u'xyzabcdef', u'xyzxyz', transtable, u'def')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000168
Walter Dörwald28256f22003-01-19 16:59:20 +0000169 table = string.maketrans('a', u'A')
170 self.checkmethod('translate', u'abc', u'Abc', table)
171 self.checkmethod('translate', u'xyz', u'xyz', table)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000172
Walter Dörwald28256f22003-01-19 16:59:20 +0000173 self.checkmethod('translate', u"abababc", u'bbbc', {ord('a'):None})
174 self.checkmethod('translate', u"abababc", u'iiic', {ord('a'):None, ord('b'):ord('i')})
175 self.checkmethod('translate', u"abababc", u'iiix', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})
176 self.checkmethod('translate', u"abababc", u'<i><i><i>c', {ord('a'):None, ord('b'):u'<i>'})
177 self.checkmethod('translate', u"abababc", u'c', {ord('a'):None, ord('b'):u''})
Guido van Rossuma831cac2000-03-10 23:23:21 +0000178
Walter Dörwald28256f22003-01-19 16:59:20 +0000179 self.assertRaises(TypeError, u'hello'.translate)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000180
Walter Dörwald28256f22003-01-19 16:59:20 +0000181 def test_split(self):
182 self.checkmethod(
183 'split',
184 u'this is the split function',
185 [u'this', u'is', u'the', u'split', u'function']
186 )
187 self.checkmethod('split', u'a|b|c|d', [u'a', u'b', u'c', u'd'], u'|')
188 self.checkmethod('split', u'a|b|c|d', [u'a', u'b', u'c|d'], u'|', 2)
189 self.checkmethod('split', u'a b c d', [u'a', u'b c d'], None, 1)
190 self.checkmethod('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
191 self.checkmethod('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 3)
192 self.checkmethod('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 4)
193 self.checkmethod('split', u'a b c d', [u'a b c d'], None, 0)
194 self.checkmethod('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
195 self.checkmethod('split', u'a b c d ', [u'a', u'b', u'c', u'd'])
196 self.checkmethod('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
197 self.checkmethod('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], '//')
198 self.checkmethod('split', 'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
199 self.checkmethod('split', u'endcase test', [u'endcase ', u''], u'test')
200 self.checkmethod('split', u'endcase test', [u'endcase ', u''], 'test')
201 self.checkmethod('split', 'endcase test', [u'endcase ', u''], u'test')
Andrew M. Kuchlingeddd68d2002-03-29 16:21:44 +0000202
Walter Dörwald28256f22003-01-19 16:59:20 +0000203 self.assertRaises(TypeError, u"hello".split, 42, 42, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000204
Walter Dörwald28256f22003-01-19 16:59:20 +0000205 def test_join(self):
206 # join now works with any sequence type
207 class Sequence:
208 def __init__(self, seq): self.seq = seq
209 def __len__(self): return len(self.seq)
210 def __getitem__(self, i): return self.seq[i]
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000211
Walter Dörwald28256f22003-01-19 16:59:20 +0000212 self.checkmethod('join', u' ', u'a b c d', [u'a', u'b', u'c', u'd'])
213 self.checkmethod('join', u' ', u'a b c d', ['a', 'b', u'c', u'd'])
214 self.checkmethod('join', u'', u'abcd', (u'a', u'b', u'c', u'd'))
215 self.checkmethod('join', u' ', u'w x y z', Sequence('wxyz'))
216 self.assertRaises(TypeError, u' '.join, 7)
217 self.assertRaises(TypeError, u' '.join, Sequence([7, u'hello', 123L]))
218 self.checkmethod('join', ' ', u'a b c d', [u'a', u'b', u'c', u'd'])
219 self.checkmethod('join', ' ', u'a b c d', ['a', 'b', u'c', u'd'])
220 self.checkmethod('join', '', u'abcd', (u'a', u'b', u'c', u'd'))
221 self.checkmethod('join', ' ', u'w x y z', Sequence(u'wxyz'))
222 self.assertRaises(TypeError, ' '.join, TypeError)
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000223
Walter Dörwald28256f22003-01-19 16:59:20 +0000224 result = u''
225 for i in range(10):
226 if i > 0:
227 result = result + u':'
228 result = result + u'x'*10
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000229
Walter Dörwald28256f22003-01-19 16:59:20 +0000230 self.checkmethod('join', u':', result, [u'x' * 10] * 10)
231 self.checkmethod('join', u':', result, (u'x' * 10,) * 10)
Marc-André Lemburge5034372000-08-08 08:04:29 +0000232
Walter Dörwald28256f22003-01-19 16:59:20 +0000233 self.assertRaises(TypeError, u"hello".join)
Marc-André Lemburge5034372000-08-08 08:04:29 +0000234
Walter Dörwald28256f22003-01-19 16:59:20 +0000235 def test_strip(self):
236 self.checkmethod('strip', u' hello ', u'hello')
237 self.checkmethod('lstrip', u' hello ', u'hello ')
238 self.checkmethod('rstrip', u' hello ', u' hello')
239 self.checkmethod('strip', u'hello', u'hello')
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000240
Walter Dörwald28256f22003-01-19 16:59:20 +0000241 # strip/lstrip/rstrip with None arg
242 self.checkmethod('strip', u' hello ', u'hello', None)
243 self.checkmethod('lstrip', u' hello ', u'hello ', None)
244 self.checkmethod('rstrip', u' hello ', u' hello', None)
245 self.checkmethod('strip', u'hello', u'hello', None)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000246
Walter Dörwald28256f22003-01-19 16:59:20 +0000247 # strip/lstrip/rstrip with unicode arg
248 self.checkmethod('strip', u'xyzzyhelloxyzzy', u'hello', u'xyz')
249 self.checkmethod('lstrip', u'xyzzyhelloxyzzy', u'helloxyzzy', u'xyz')
250 self.checkmethod('rstrip', u'xyzzyhelloxyzzy', u'xyzzyhello', u'xyz')
251 self.checkmethod('strip', u'hello', u'hello', u'xyz')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000252
Walter Dörwald28256f22003-01-19 16:59:20 +0000253 # strip/lstrip/rstrip with str arg
254 self.checkmethod('strip', u'xyzzyhelloxyzzy', u'hello', 'xyz')
255 self.checkmethod('lstrip', u'xyzzyhelloxyzzy', u'helloxyzzy', 'xyz')
256 self.checkmethod('rstrip', u'xyzzyhelloxyzzy', u'xyzzyhello', 'xyz')
257 self.checkmethod('strip', u'hello', u'hello', 'xyz')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000258
Walter Dörwald28256f22003-01-19 16:59:20 +0000259 self.assertRaises(TypeError, u"hello".strip, 42, 42)
260 self.assertRaises(UnicodeError, u"hello".strip, "\xff")
Guido van Rossuma831cac2000-03-10 23:23:21 +0000261
Walter Dörwald28256f22003-01-19 16:59:20 +0000262 def test_swapcase(self):
263 self.checkmethod('swapcase', u'HeLLo cOmpUteRs', u'hEllO CoMPuTErS')
Marc-André Lemburg9d467412000-07-05 09:46:40 +0000264
Walter Dörwald28256f22003-01-19 16:59:20 +0000265 self.assertRaises(TypeError, u"hello".swapcase, 42)
Marc-André Lemburg9d467412000-07-05 09:46:40 +0000266
Walter Dörwald28256f22003-01-19 16:59:20 +0000267 def test_replace(self):
268 self.checkmethod('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1)
269 self.checkmethod('replace', u'one!two!three!', u'onetwothree', '!', '')
270 self.checkmethod('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2)
271 self.checkmethod('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3)
272 self.checkmethod('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4)
273 self.checkmethod('replace', u'one!two!three!', u'one!two!three!', u'!', u'@', 0)
274 self.checkmethod('replace', u'one!two!three!', u'one@two@three@', u'!', u'@')
275 self.checkmethod('replace', u'one!two!three!', u'one!two!three!', u'x', u'@')
276 self.checkmethod('replace', u'one!two!three!', u'one!two!three!', u'x', u'@', 2)
277 self.checkmethod('replace', u'abc', u'-a-b-c-', u'', u'-')
278 self.checkmethod('replace', u'abc', u'-a-b-c', u'', u'-', 3)
279 self.checkmethod('replace', u'abc', u'abc', u'', u'-', 0)
280 self.checkmethod('replace', u'abc', u'abc', u'ab', u'--', 0)
281 self.checkmethod('replace', u'abc', u'abc', u'xy', u'--')
282 self.checkmethod('replace', u'', u'', u'', u'')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000283
Walter Dörwald28256f22003-01-19 16:59:20 +0000284 # method call forwarded from str implementation because of unicode argument
285 self.checkmethod('replace', 'one!two!three!', u'one@two!three!', u'!', u'@', 1)
286 self.assertRaises(TypeError, 'replace'.replace, 42)
287 self.assertRaises(TypeError, 'replace'.replace, u"r", 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000288
Walter Dörwald28256f22003-01-19 16:59:20 +0000289 self.assertRaises(TypeError, u"hello".replace)
290 self.assertRaises(TypeError, u"hello".replace, 42, u"h")
291 self.assertRaises(TypeError, u"hello".replace, u"h", 42)
Guido van Rossumd4d26842000-03-13 23:21:48 +0000292
Walter Dörwald28256f22003-01-19 16:59:20 +0000293 def test_startswith(self):
294 self.checkmethod('startswith', u'hello', True, u'he')
295 self.checkmethod('startswith', u'hello', True, u'hello')
296 self.checkmethod('startswith', u'hello', False, u'hello world')
297 self.checkmethod('startswith', u'hello', True, u'')
298 self.checkmethod('startswith', u'hello', False, u'ello')
299 self.checkmethod('startswith', u'hello', True, u'ello', 1)
300 self.checkmethod('startswith', u'hello', True, u'o', 4)
301 self.checkmethod('startswith', u'hello', False, u'o', 5)
302 self.checkmethod('startswith', u'hello', True, u'', 5)
303 self.checkmethod('startswith', u'hello', False, u'lo', 6)
304 self.checkmethod('startswith', u'helloworld', True, u'lowo', 3)
305 self.checkmethod('startswith', u'helloworld', True, u'lowo', 3, 7)
306 self.checkmethod('startswith', u'helloworld', False, u'lowo', 3, 6)
Marc-André Lemburg84625732000-06-13 12:05:36 +0000307
Walter Dörwald28256f22003-01-19 16:59:20 +0000308 self.assertRaises(TypeError, u"hello".startswith)
309 self.assertRaises(TypeError, u"hello".startswith, 42)
Marc-André Lemburg84625732000-06-13 12:05:36 +0000310
Walter Dörwald28256f22003-01-19 16:59:20 +0000311 def test_endswith(self):
312 self.checkmethod('endswith', u'hello', True, u'lo')
313 self.checkmethod('endswith', u'hello', False, u'he')
314 self.checkmethod('endswith', u'hello', True, u'')
315 self.checkmethod('endswith', u'hello', False, u'hello world')
316 self.checkmethod('endswith', u'helloworld', False, u'worl')
317 self.checkmethod('endswith', u'helloworld', True, u'worl', 3, 9)
318 self.checkmethod('endswith', u'helloworld', True, u'world', 3, 12)
319 self.checkmethod('endswith', u'helloworld', True, u'lowo', 1, 7)
320 self.checkmethod('endswith', u'helloworld', True, u'lowo', 2, 7)
321 self.checkmethod('endswith', u'helloworld', True, u'lowo', 3, 7)
322 self.checkmethod('endswith', u'helloworld', False, u'lowo', 4, 7)
323 self.checkmethod('endswith', u'helloworld', False, u'lowo', 3, 8)
324 self.checkmethod('endswith', u'ab', False, u'ab', 0, 1)
325 self.checkmethod('endswith', u'ab', False, u'ab', 0, 0)
326 self.checkmethod('endswith', 'helloworld', True, u'd')
327 self.checkmethod('endswith', 'helloworld', False, u'l')
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000328
Walter Dörwald28256f22003-01-19 16:59:20 +0000329 self.assertRaises(TypeError, u"hello".endswith)
330 self.assertRaises(TypeError, u"hello".endswith, 42)
331
332 def test_expandtabs(self):
333 self.checkmethod('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab def\ng hi')
334 self.checkmethod('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab def\ng hi', 8)
335 self.checkmethod('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab def\ng hi', 4)
336 self.checkmethod('expandtabs', u'abc\r\nab\tdef\ng\thi', u'abc\r\nab def\ng hi', 4)
337 self.checkmethod('expandtabs', u'abc\r\nab\r\ndef\ng\r\nhi', u'abc\r\nab\r\ndef\ng\r\nhi', 4)
338
339 self.assertRaises(TypeError, u"hello".expandtabs, 42, 42)
340
341 def test_capwords(self):
342 if 0:
343 self.checkmethod('capwords', u'abc def ghi', u'Abc Def Ghi')
344 self.checkmethod('capwords', u'abc\tdef\nghi', u'Abc Def Ghi')
345 self.checkmethod('capwords', u'abc\t def \nghi', u'Abc Def Ghi')
346
347 def test_zfill(self):
348 self.checkmethod('zfill', u'123', u'123', 2)
349 self.checkmethod('zfill', u'123', u'123', 3)
350 self.checkmethod('zfill', u'123', u'0123', 4)
351 self.checkmethod('zfill', u'+123', u'+123', 3)
352 self.checkmethod('zfill', u'+123', u'+123', 4)
353 self.checkmethod('zfill', u'+123', u'+0123', 5)
354 self.checkmethod('zfill', u'-123', u'-123', 3)
355 self.checkmethod('zfill', u'-123', u'-123', 4)
356 self.checkmethod('zfill', u'-123', u'-0123', 5)
357 self.checkmethod('zfill', u'', u'000', 3)
358 self.checkmethod('zfill', u'34', u'34', 1)
359 self.checkmethod('zfill', u'34', u'00034', 5)
360
361 self.assertRaises(TypeError, u"123".zfill)
362
363 def test_comparison(self):
364 # Comparisons:
365 self.assertEqual(u'abc', 'abc')
366 self.assertEqual('abc', u'abc')
367 self.assertEqual(u'abc', u'abc')
368 self.assert_(u'abcd' > 'abc')
369 self.assert_('abcd' > u'abc')
370 self.assert_(u'abcd' > u'abc')
371 self.assert_(u'abc' < 'abcd')
372 self.assert_('abc' < u'abcd')
373 self.assert_(u'abc' < u'abcd')
374
375 if 0:
376 # Move these tests to a Unicode collation module test...
377 # Testing UTF-16 code point order comparisons...
378
379 # No surrogates, no fixup required.
380 self.assert_(u'\u0061' < u'\u20ac')
381 # Non surrogate below surrogate value, no fixup required
382 self.assert_(u'\u0061' < u'\ud800\udc02')
383
384 # Non surrogate above surrogate value, fixup required
385 def test_lecmp(s, s2):
386 self.assert_(s < s2)
387
388 def test_fixup(s):
389 s2 = u'\ud800\udc01'
390 test_lecmp(s, s2)
391 s2 = u'\ud900\udc01'
392 test_lecmp(s, s2)
393 s2 = u'\uda00\udc01'
394 test_lecmp(s, s2)
395 s2 = u'\udb00\udc01'
396 test_lecmp(s, s2)
397 s2 = u'\ud800\udd01'
398 test_lecmp(s, s2)
399 s2 = u'\ud900\udd01'
400 test_lecmp(s, s2)
401 s2 = u'\uda00\udd01'
402 test_lecmp(s, s2)
403 s2 = u'\udb00\udd01'
404 test_lecmp(s, s2)
405 s2 = u'\ud800\ude01'
406 test_lecmp(s, s2)
407 s2 = u'\ud900\ude01'
408 test_lecmp(s, s2)
409 s2 = u'\uda00\ude01'
410 test_lecmp(s, s2)
411 s2 = u'\udb00\ude01'
412 test_lecmp(s, s2)
413 s2 = u'\ud800\udfff'
414 test_lecmp(s, s2)
415 s2 = u'\ud900\udfff'
416 test_lecmp(s, s2)
417 s2 = u'\uda00\udfff'
418 test_lecmp(s, s2)
419 s2 = u'\udb00\udfff'
420 test_lecmp(s, s2)
421
422 test_fixup(u'\ue000')
423 test_fixup(u'\uff61')
424
425 # Surrogates on both sides, no fixup required
426 self.assert_(u'\ud800\udc02' < u'\ud84d\udc56')
427
428 def test_ljust(self):
429 self.checkmethod('ljust', u'abc', u'abc ', 10)
430 self.checkmethod('ljust', u'abc', u'abc ', 6)
431 self.checkmethod('ljust', u'abc', u'abc', 2)
432
433 self.assertRaises(TypeError, u"abc".ljust)
434
435 def test_rjust(self):
436 self.checkmethod('rjust', u'abc', u' abc', 10)
437 self.checkmethod('rjust', u'abc', u' abc', 6)
438 self.checkmethod('rjust', u'abc', u'abc', 2)
439
440 self.assertRaises(TypeError, u"abc".rjust)
441
442 def test_center(self):
443 self.checkmethod('center', u'abc', u' abc ', 10)
444 self.checkmethod('center', u'abc', u' abc ', 6)
445 self.checkmethod('center', u'abc', u'abc', 2)
446
447 self.assertRaises(TypeError, u"abc".center)
448
449 def test_islower(self):
450 self.checkmethod('islower', u'', False)
451 self.checkmethod('islower', u'a', True)
452 self.checkmethod('islower', u'A', False)
453 self.checkmethod('islower', u'\n', False)
454 self.checkmethod('islower', u'\u1FFc', False)
455 self.checkmethod('islower', u'abc', True)
456 self.checkmethod('islower', u'aBc', False)
457 self.checkmethod('islower', u'abc\n', True)
458
459 self.assertRaises(TypeError, u"abc".islower, 42)
460
461 def test_isupper(self):
462 self.checkmethod('isupper', u'', False)
463 self.checkmethod('isupper', u'a', False)
464 self.checkmethod('isupper', u'A', True)
465 self.checkmethod('isupper', u'\n', False)
466 if sys.platform[:4] != 'java':
467 self.checkmethod('isupper', u'\u1FFc', False)
468 self.checkmethod('isupper', u'ABC', True)
469 self.checkmethod('isupper', u'AbC', False)
470 self.checkmethod('isupper', u'ABC\n', True)
471
472 self.assertRaises(TypeError, u"abc".isupper, 42)
473
474 def test_istitle(self):
475 self.checkmethod('istitle', u'', False)
476 self.checkmethod('istitle', u'a', False)
477 self.checkmethod('istitle', u'A', True)
478 self.checkmethod('istitle', u'\n', False)
479 self.checkmethod('istitle', u'\u1FFc', True)
480 self.checkmethod('istitle', u'A Titlecased Line', True)
481 self.checkmethod('istitle', u'A\nTitlecased Line', True)
482 self.checkmethod('istitle', u'A Titlecased, Line', True)
483 self.checkmethod('istitle', u'Greek \u1FFcitlecases ...', True)
484 self.checkmethod('istitle', u'Not a capitalized String', False)
485 self.checkmethod('istitle', u'Not\ta Titlecase String', False)
486 self.checkmethod('istitle', u'Not--a Titlecase String', False)
487 self.checkmethod('istitle', u'NOT', False)
488
489 self.assertRaises(TypeError, u"abc".istitle, 42)
490
491 def test_isspace(self):
492 self.checkmethod('isspace', u'', False)
493 self.checkmethod('isspace', u'a', False)
494 self.checkmethod('isspace', u' ', True)
495 self.checkmethod('isspace', u'\t', True)
496 self.checkmethod('isspace', u'\r', True)
497 self.checkmethod('isspace', u'\n', True)
498 self.checkmethod('isspace', u' \t\r\n', True)
499 self.checkmethod('isspace', u' \t\r\na', False)
500
501 self.assertRaises(TypeError, u"abc".isspace, 42)
502
503 def test_isalpha(self):
504 self.checkmethod('isalpha', u'', False)
505 self.checkmethod('isalpha', u'a', True)
506 self.checkmethod('isalpha', u'A', True)
507 self.checkmethod('isalpha', u'\n', False)
508 self.checkmethod('isalpha', u'\u1FFc', True)
509 self.checkmethod('isalpha', u'abc', True)
510 self.checkmethod('isalpha', u'aBc123', False)
511 self.checkmethod('isalpha', u'abc\n', False)
512
513 self.assertRaises(TypeError, u"abc".isalpha, 42)
514
515 def test_isalnum(self):
516 self.checkmethod('isalnum', u'', False)
517 self.checkmethod('isalnum', u'a', True)
518 self.checkmethod('isalnum', u'A', True)
519 self.checkmethod('isalnum', u'\n', False)
520 self.checkmethod('isalnum', u'123abc456', True)
521 self.checkmethod('isalnum', u'a1b3c', True)
522 self.checkmethod('isalnum', u'aBc000 ', False)
523 self.checkmethod('isalnum', u'abc\n', False)
524
525 self.assertRaises(TypeError, u"abc".isalnum, 42)
526
527 def test_isdecimal(self):
528 self.checkmethod('isdecimal', u'', False)
529 self.checkmethod('isdecimal', u'a', False)
530 self.checkmethod('isdecimal', u'0', True)
531 self.checkmethod('isdecimal', u'\u2460', False) # CIRCLED DIGIT ONE
532 self.checkmethod('isdecimal', u'\xbc', False) # VULGAR FRACTION ONE QUARTER
533 self.checkmethod('isdecimal', u'\u0660', True) # ARABIC-INDIC DIGIT ZERO
534 self.checkmethod('isdecimal', u'0123456789', True)
535 self.checkmethod('isdecimal', u'0123456789a', False)
536
537 self.assertRaises(TypeError, u"abc".isdecimal, 42)
538
539 def test_isdigit(self):
540 self.checkmethod('isdigit', u'', False)
541 self.checkmethod('isdigit', u'a', False)
542 self.checkmethod('isdigit', u'0', True)
543 self.checkmethod('isdigit', u'\u2460', True)
544 self.checkmethod('isdigit', u'\xbc', False)
545 self.checkmethod('isdigit', u'\u0660', True)
546 self.checkmethod('isdigit', u'0123456789', True)
547 self.checkmethod('isdigit', u'0123456789a', False)
548
549 self.assertRaises(TypeError, u"abc".isdigit, 42)
550
551 def test_isnumeric(self):
552 self.checkmethod('isnumeric', u'', False)
553 self.checkmethod('isnumeric', u'a', False)
554 self.checkmethod('isnumeric', u'0', True)
555 self.checkmethod('isnumeric', u'\u2460', True)
556 self.checkmethod('isnumeric', u'\xbc', True)
557 self.checkmethod('isnumeric', u'\u0660', True)
558 self.checkmethod('isnumeric', u'0123456789', True)
559 self.checkmethod('isnumeric', u'0123456789a', False)
560
561 self.assertRaises(TypeError, u"abc".isnumeric, 42)
562
563 def test_splitlines(self):
564 self.checkmethod('splitlines', u"abc\ndef\n\rghi", [u'abc', u'def', u'', u'ghi'])
565 self.checkmethod('splitlines', u"abc\ndef\n\r\nghi", [u'abc', u'def', u'', u'ghi'])
566 self.checkmethod('splitlines', u"abc\ndef\r\nghi", [u'abc', u'def', u'ghi'])
567 self.checkmethod('splitlines', u"abc\ndef\r\nghi\n", [u'abc', u'def', u'ghi'])
568 self.checkmethod('splitlines', u"abc\ndef\r\nghi\n\r", [u'abc', u'def', u'ghi', u''])
569 self.checkmethod('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'', u'abc', u'def', u'ghi', u''])
570 self.checkmethod('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'\n', u'abc\n', u'def\r\n', u'ghi\n', u'\r'], True)
571
572 self.assertRaises(TypeError, u"abc".splitlines, 42, 42)
573
def test_contains(self):
    """Check the ``in`` operator for unicode and str operands.

    Covers substring tests on mixed str/unicode pairs, membership in
    tuples, empty needles and embedded NUL characters, plus the error
    cases of ``__contains__``.
    """
    # Each entry is (needle, container, expected value of
    # ``needle in container``); entries are checked in order.
    first_batch = [
        ('a', u'abdb', True),
        ('a', u'bdab', True),
        ('a', u'bdaba', True),
        ('a', u'bdba', True),
        ('a', u'bdba', True),
        (u'a', u'bdba', True),
        (u'a', u'bdb', False),
        (u'a', 'bdb', False),
        (u'a', 'bdba', True),
        (u'a', ('a', 1, None), True),
        (u'a', (1, None, 'a'), True),
        (u'a', (1, None, u'a'), True),
        ('a', ('a', 1, None), True),
        ('a', (1, None, 'a'), True),
        ('a', (1, None, u'a'), True),
        ('a', ('x', 1, u'y'), False),
        ('a', ('x', 1, None), False),
        (u'abcd', u'abcxxxx', False),
        (u'ab', u'abcd', True),
        ('ab', u'abc', True),
        (u'ab', 'abc', True),
        (u'ab', (1, None, u'ab'), True),
        (u'', u'abc', True),
        ('', u'abc', True),
    ]
    for needle, container, expected in first_batch:
        self.assertEqual(needle in container, expected)

    # A failure here means either that the contains operator does not
    # propagate UnicodeErrors, or that the default encoding has been
    # changed.
    self.assertRaises(UnicodeError, 'g\xe2teau'.__contains__, u'\xe2')

    second_batch = [
        (u'', '', True),
        ('', u'', True),
        (u'', u'', True),
        (u'', 'abc', True),
        ('', u'abc', True),
        (u'', u'abc', True),
        (u'\0', 'abc', False),
        ('\0', u'abc', False),
        (u'\0', u'abc', False),
        (u'\0', '\0abc', True),
        ('\0', u'\0abc', True),
        (u'\0', u'\0abc', True),
        (u'\0', 'abc\0', True),
        ('\0', u'abc\0', True),
        (u'\0', u'abc\0', True),
        (u'a', '\0abc', True),
        ('a', u'\0abc', True),
        (u'a', u'\0abc', True),
        (u'asdf', 'asdf', True),
        ('asdf', u'asdf', True),
        (u'asdf', u'asdf', True),
        (u'asdf', 'asd', False),
        ('asdf', u'asd', False),
        (u'asdf', u'asd', False),
        (u'asdf', '', False),
        ('asdf', u'', False),
        (u'asdf', u'', False),
    ]
    for needle, container, expected in second_batch:
        self.assertEqual(needle in container, expected)

    # Calling __contains__ without an argument must be rejected.
    self.assertRaises(TypeError, u"abc".__contains__)
635
def test_formatting(self):
    """Check %-formatting where the format string or an argument is unicode.

    Covers positional and mapping arguments, ``%c`` with characters and
    ordinals, ``*`` width/precision, float precision limits, formatting
    delegated from str format strings, and the error cases of
    ``__mod__``.
    """
    # Testing Unicode formatting strings...
    self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
    # %5.2f pads to a minimum width of five characters, so four-character
    # results such as "3.00" carry one pad space in addition to the
    # literal separator space; "1003.57" is already wider and gets none.
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57')
    self.assertEqual(u"%c" % (u"a",), u'a')
    self.assertEqual(u"%c" % ("a",), u'a')
    self.assertEqual(u"%c" % (34,), u'"')
    self.assertEqual(u"%c" % (36,), u'$')
    self.assertEqual(u"%d".__mod__(10), u'10')
    if not sys.platform.startswith('java'):
        self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
    self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
    self.assertEqual(u"%(x)s, %(ä)s" % {'x':u"abc", u'ä':"def"}, u'abc, def')

    # Ordinals outside the valid range are rejected by %c.
    for ordinal in (-100, 0x200000):
        self.assertRaises(ValueError, u"%c".__mod__, ordinal)

    # float formatting
    for prec in xrange(100):
        fmt = u'%%.%if' % prec
        value = 0.01
        for x in xrange(60):
            value = value * 3.141592655 / 3.0 * 10.0
            # The formatfloat() code in stringobject.c and
            # unicodeobject.c uses a 120 byte buffer and switches from
            # 'f' formatting to 'g' at precision 50, so we expect
            # OverflowErrors for the ranges x < 50 and prec >= 67.
            if x < 50 and prec >= 67:
                self.assertRaises(OverflowError, fmt.__mod__, value)
            else:
                # Only checks that formatting does not raise.
                fmt % value

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % u"abc", u'...abc...')
    # A '*' width of 5 right-justifies in a five-character field; a
    # negative width left-justifies.  The precision truncates the string
    # before it is padded.
    self.assertEqual('%*s' % (5,u'abc',), u'  abc')
    self.assertEqual('%*s' % (-5,u'abc',), u'abc  ')
    self.assertEqual('%*.*s' % (5,2,u'abc',), u'   ab')
    self.assertEqual('%*.*s' % (5,3,u'abc',), u'  abc')
    self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5,3,u'abc',), u'103   abc')

    self.assertEqual(u'%3ld' % 42, u' 42')
    self.assertEqual(u'%07.2f' % 42, u'0042.00')

    # Malformed calls and mismatched arguments.
    self.assertRaises(TypeError, u"abc".__mod__)
    self.assertRaises(TypeError, u"%(foo)s".__mod__, 42)
    self.assertRaises(TypeError, u"%s%s".__mod__, (42,))
    self.assertRaises(TypeError, u"%c".__mod__, (None,))
    self.assertRaises(ValueError, u"%c".__mod__, (sys.maxunicode+1,))
    self.assertRaises(ValueError, u"%(foo".__mod__, {})
    self.assertRaises(TypeError, u"%(foo)s %(bar)s".__mod__, (u"foo", 42))

    # argument names with properly nested brackets are supported
    self.assertEqual(u"%((foo))s" % {u"(foo)": u"bar"}, u"bar")

    # 100 is a magic number in PyUnicode_Format, this forces a resize
    self.assertEqual(u"%sx" % (103*u"a"), 103*u"a"+u"x")

    self.assertRaises(TypeError, u"%*s".__mod__, (u"foo", u"bar"))
    self.assertRaises(TypeError, u"%10.*f".__mod__, (u"foo", 42.))
    self.assertRaises(ValueError, u"%10".__mod__, (42,))
709
710 def test_constructor(self):
711 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
712
713 self.assertEqual(
714 unicode(u'unicode remains unicode'),
715 u'unicode remains unicode'
716 )
717
718 class UnicodeSubclass(unicode):
Marc-André Lemburg79f57832002-12-29 19:44:06 +0000719 pass
Guido van Rossuma831cac2000-03-10 23:23:21 +0000720
Walter Dörwald28256f22003-01-19 16:59:20 +0000721 self.assertEqual(
722 unicode(UnicodeSubclass('unicode subclass becomes unicode')),
723 u'unicode subclass becomes unicode'
724 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000725
Walter Dörwald28256f22003-01-19 16:59:20 +0000726 self.assertEqual(
727 unicode('strings are converted to unicode'),
728 u'strings are converted to unicode'
729 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000730
Walter Dörwald28256f22003-01-19 16:59:20 +0000731 class UnicodeCompat:
732 def __init__(self, x):
733 self.x = x
734 def __unicode__(self):
735 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000736
Walter Dörwald28256f22003-01-19 16:59:20 +0000737 self.assertEqual(
738 unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
739 u'__unicode__ compatible objects are recognized')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000740
Walter Dörwald28256f22003-01-19 16:59:20 +0000741 class StringCompat:
742 def __init__(self, x):
743 self.x = x
744 def __str__(self):
745 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000746
Walter Dörwald28256f22003-01-19 16:59:20 +0000747 self.assertEqual(
748 unicode(StringCompat('__str__ compatible objects are recognized')),
749 u'__str__ compatible objects are recognized'
750 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000751
Walter Dörwald28256f22003-01-19 16:59:20 +0000752 # unicode(obj) is compatible to str():
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000753
Walter Dörwald28256f22003-01-19 16:59:20 +0000754 o = StringCompat('unicode(obj) is compatible to str()')
755 self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
756 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000757
Walter Dörwald28256f22003-01-19 16:59:20 +0000758 for obj in (123, 123.45, 123L):
759 self.assertEqual(unicode(obj), unicode(str(obj)))
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000760
Walter Dörwald28256f22003-01-19 16:59:20 +0000761 # unicode(obj, encoding, error) tests (this maps to
762 # PyUnicode_FromEncodedObject() at C level)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000763
Walter Dörwald28256f22003-01-19 16:59:20 +0000764 if not sys.platform.startswith('java'):
765 self.assertRaises(
766 TypeError,
767 unicode,
768 u'decoding unicode is not supported',
769 'utf-8',
770 'strict'
771 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000772
Walter Dörwald28256f22003-01-19 16:59:20 +0000773 self.assertEqual(
774 unicode('strings are decoded to unicode', 'utf-8', 'strict'),
775 u'strings are decoded to unicode'
776 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000777
Walter Dörwald28256f22003-01-19 16:59:20 +0000778 if not sys.platform.startswith('java'):
779 self.assertEqual(
780 unicode(
781 buffer('character buffers are decoded to unicode'),
782 'utf-8',
783 'strict'
784 ),
785 u'character buffers are decoded to unicode'
786 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000787
Walter Dörwald28256f22003-01-19 16:59:20 +0000788 self.assertRaises(TypeError, unicode, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000789
Walter Dörwald28256f22003-01-19 16:59:20 +0000790 def test_codecs_utf7(self):
791 utfTests = [
792 (u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
793 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
794 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
795 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
796 (u'+', '+-'),
797 (u'+-', '+--'),
798 (u'+?', '+-?'),
799 (u'\?', '+AFw?'),
800 (u'+?', '+-?'),
801 (ur'\\?', '+AFwAXA?'),
802 (ur'\\\?', '+AFwAXABc?'),
803 (ur'++--', '+-+---')
804 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000805
Walter Dörwald28256f22003-01-19 16:59:20 +0000806 for (x, y) in utfTests:
807 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000808
Walter Dörwald28256f22003-01-19 16:59:20 +0000809 # surrogates not supported
810 self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000811
Walter Dörwald28256f22003-01-19 16:59:20 +0000812 self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000813
def test_codecs_utf8(self):
    """Check UTF-8 encoding and decoding of selected strings.

    The encoding cases include surrogate pairs, lone surrogates and a
    longer text; the decoding cases cover three- and four-byte sequences.
    """
    # Each entry is (unicode text, expected UTF-8 byte string).
    encode_cases = [
        (u'', ''),
        (u'\u20ac', '\xe2\x82\xac'),
        (u'\ud800\udc02', '\xf0\x90\x80\x82'),
        (u'\ud84d\udc56', '\xf0\xa3\x91\x96'),
        (u'\ud800', '\xed\xa0\x80'),
        (u'\udc00', '\xed\xb0\x80'),
        (u'\ud800\udc02'*1000, '\xf0\x90\x80\x82'*1000),
    ]
    for text, encoded in encode_cases:
        self.assertEqual(text.encode('utf-8'), encoded)

    long_text = (
        u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
        u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
        u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
        u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
        u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
        u' Nunstuck git und'
    )
    long_bytes = (
        '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
        '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
        '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
        '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
        '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
        '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
        '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
        '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
        '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
        '\xe3\x80\x8cWenn ist das Nunstuck git und'
    )
    self.assertEqual(long_text.encode('utf-8'), long_bytes)

    # UTF-8 specific decoding tests
    decode_cases = [
        ('\xf0\xa3\x91\x96', u'\U00023456'),
        ('\xf0\x90\x80\x82', u'\U00010002'),
        ('\xe2\x82\xac', u'\u20ac'),
    ]
    for encoded, text in decode_cases:
        self.assertEqual(unicode(encoded, 'utf-8'), text)

    # Other possible utf-8 test cases:
    # * strict decoding testing for all of the
    #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000852
def test_codecs_errors(self):
    """Check error handling when encoding, decoding and using broken codecs.

    Note: this registers a codec search function with ``codecs.register``
    and does not remove it again.
    """
    text = u'Andr\202 x'
    raw = 'Andr\202 x'

    # Error handling (encoding)
    for errors in ((), ('strict',)):
        self.assertRaises(UnicodeError, text.encode, 'ascii', *errors)
    self.assertEqual(text.encode('ascii', 'ignore'), "Andr x")
    self.assertEqual(text.encode('ascii', 'replace'), "Andr? x")

    # Error handling (decoding)
    for errors in ((), ('strict',)):
        self.assertRaises(UnicodeError, unicode, raw, 'ascii', *errors)
    self.assertEqual(unicode(raw, 'ascii', 'ignore'), u"Andr x")
    self.assertEqual(unicode(raw, 'ascii', 'replace'), u'Andr\uFFFD x')

    # Error handling (unknown character names)
    self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx")

    # Error handling (truncated escape sequence)
    self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")

    # Error handling (bad decoder return)
    def search_function(encoding):
        def not_a_tuple(input, errors="strict"):
            return 42 # not a tuple
        def no_unicode(input, errors="strict"):
            return (42, 42) # no unicode
        # Entries are (encoder, decoder, stream reader, stream writer);
        # unknown names fall through to None.
        broken_codecs = {
            "test.unicode1": (not_a_tuple, not_a_tuple, None, None),
            "test.unicode2": (no_unicode, no_unicode, None, None),
        }
        return broken_codecs.get(encoding)

    codecs.register(search_function)
    self.assertRaises(TypeError, "hello".decode, "test.unicode1")
    self.assertRaises(TypeError, unicode, "hello", "test.unicode2")
    self.assertRaises(TypeError, u"hello".encode, "test.unicode1")
    self.assertRaises(TypeError, u"hello".encode, "test.unicode2")
    # executes PyUnicode_Encode()
    import imp
    missing_dirs = [u"non-existing dir"]
    self.assertRaises(
        ImportError, imp.find_module, "non-existing module", missing_dirs)

    # Error handling (wrong arguments)
    self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)

    # Error handling (PyUnicode_EncodeDecimal())
    self.assertRaises(UnicodeError, int, u"\u0200")
Guido van Rossum97064862000-04-10 13:52:48 +0000907
def test_codecs(self):
    """Check basic encoding results and encode/decode round trips."""
    # Encoding: (codec name, expected bytes for u'hello')
    hello_cases = (
        ('ascii', 'hello'),
        ('utf-7', 'hello'),
        ('utf-8', 'hello'),
        ('utf8', 'hello'),
        ('utf-16-le', 'h\000e\000l\000l\000o\000'),
        ('utf-16-be', '\000h\000e\000l\000l\000o'),
        ('latin-1', 'hello'),
    )
    for codec, expected in hello_cases:
        self.assertEqual(u'hello'.encode(codec), expected)

    def check_roundtrip(text, codecs_to_try):
        # Encoding and then decoding with the same codec must give the
        # text back unchanged.
        for codec in codecs_to_try:
            self.assertEqual(unicode(text.encode(codec), codec), text)

    # Roundtrip safety for BMP (just the first 1024 chars)
    check_roundtrip(
        u''.join(map(unichr, xrange(1024))),
        ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
         'raw_unicode_escape', 'unicode_escape', 'unicode_internal'))

    # Roundtrip safety for BMP (just the first 256 chars)
    check_roundtrip(u''.join(map(unichr, xrange(256))), ('latin-1',))

    # Roundtrip safety for BMP (just the first 128 chars)
    check_roundtrip(u''.join(map(unichr, xrange(128))), ('ascii',))

    # Roundtrip safety for non-BMP (just a few chars)
    check_roundtrip(
        u'\U00010001\U00020002\U00030003\U00040004\U00050005',
        ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
         #'raw_unicode_escape',
         'unicode_escape', 'unicode_internal'))

    # UTF-8 must be roundtrip safe for all UCS-2 code points
    # This excludes surrogates: in the full range, there would be
    # a surrogate pair (\udbff\udc00), which gets converted back
    # to a non-BMP character (\U0010fc00)
    ucs2_ordinals = range(0,0xd800)+range(0xe000,0x10000)
    check_roundtrip(u''.join(map(unichr, ucs2_ordinals)), ('utf-8',))
Guido van Rossum9e896b32000-04-05 20:11:21 +0000948
def test_codecs_charmap(self):
    """Check that charmap codecs round-trip byte strings.

    For each listed codec, decoding the bytes and encoding the result
    again must reproduce the original byte string.
    """
    # Codecs expected to round-trip the bytes 0-127.
    low_half_codecs = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
        'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        'cp1256', 'cp1257', 'cp1258',
        'cp856', 'cp857', 'cp864', 'cp869', 'cp874',

        'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
        'cp1006', 'iso8859_8',

        ### These have undefined mappings:
        #'cp424',

        ### These fail the round-trip:
        #'cp875'
    )
    # Codecs expected to round-trip the bytes 128-255.
    high_half_codecs = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_4', 'iso8859_5',
        'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        ### These have undefined mappings:
        #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        #'cp1256', 'cp1257', 'cp1258',
        #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
        #'iso8859_3', 'iso8859_6', 'iso8859_7',
        #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',

        ### These fail the round-trip:
        #'cp1006', 'cp875', 'iso8859_8',
    )

    # 0-127
    low_bytes = ''.join(map(chr, xrange(128)))
    for codec in low_half_codecs:
        self.assertEqual(unicode(low_bytes, codec).encode(codec), low_bytes)

    # 128-255
    high_bytes = ''.join(map(chr, xrange(128, 256)))
    for codec in high_half_codecs:
        self.assertEqual(unicode(high_bytes, codec).encode(codec), high_bytes)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001002
Walter Dörwald28256f22003-01-19 16:59:20 +00001003 def test_concatenation(self):
1004 self.assertEqual((u"abc" u"def"), u"abcdef")
1005 self.assertEqual(("abc" u"def"), u"abcdef")
1006 self.assertEqual((u"abc" "def"), u"abcdef")
1007 self.assertEqual((u"abc" u"def" "ghi"), u"abcdefghi")
1008 self.assertEqual(("abc" "def" u"ghi"), u"abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +00001009
def test_printing(self):
    """Print unicode objects to a file-like object that discards them.

    There are no assertions; the test only requires that none of the
    print statements raises.
    """
    class BitBucket:
        # Minimal file-like object: accepts writes and drops the text.
        def write(self, text):
            pass

    sink = BitBucket()
    # Single and mixed str/unicode arguments.
    print >>sink, u'abc'
    print >>sink, u'abc', u'def'
    print >>sink, u'abc', 'def'
    print >>sink, 'abc', u'def'
    # Text ending in a newline, with and without a trailing comma.
    print >>sink, u'abc\n'
    print >>sink, u'abc\n',
    print >>sink, u'abc\n',
    print >>sink, u'def\n'
    print >>sink, u'def\n'
Fred Drake004d5e62000-10-23 17:22:08 +00001025
def test_mul(self):
    """Check ``__mul__`` on unicode for negative, zero and positive counts.

    Uses ``self.checkmethod``, which is defined outside this part of the
    file.
    """
    # Each entry is (repeat count, expected result of u'abc' * count).
    repeat_cases = ((-1, u''), (0, u''), (1, u'abc'), (3, u'abcabcabc'))
    for count, expected in repeat_cases:
        self.checkmethod('__mul__', u'abc', expected, count)

    # Repeating an already long string sys.maxint times must overflow.
    long_text = 10000*u'abc'
    self.assertRaises(OverflowError, long_text.__mul__, sys.maxint)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001032
Walter Dörwald28256f22003-01-19 16:59:20 +00001033 def test_subscript(self):
1034 self.checkmethod('__getitem__', u'abc', u'a', 0)
1035 self.checkmethod('__getitem__', u'abc', u'c', -1)
1036 self.checkmethod('__getitem__', u'abc', u'a', 0L)
1037 self.checkmethod('__getitem__', u'abc', u'abc', slice(0, 3))
1038 self.checkmethod('__getitem__', u'abc', u'abc', slice(0, 1000))
1039 self.checkmethod('__getitem__', u'abc', u'a', slice(0, 1))
1040 self.checkmethod('__getitem__', u'abc', u'', slice(0, 0))
1041 # FIXME What about negative indizes? This is handled differently by [] and __getitem__(slice)
Fred Drakee0243e22000-04-13 14:11:56 +00001042
Walter Dörwald28256f22003-01-19 16:59:20 +00001043 self.assertRaises(TypeError, u"abc".__getitem__, "def")
Marc-André Lemburg0c4d8d02001-11-20 15:17:25 +00001044
def test_slice(self):
    """Check ``__getslice__`` on unicode for in-range and out-of-range bounds.

    Uses ``self.checkmethod``, which is defined outside this part of the
    file.
    """
    # Each entry is (start, stop, expected u'abc'.__getslice__(start, stop)).
    slice_cases = (
        (0, 1000, u'abc'),
        (0, 3, u'abc'),
        (0, 2, u'ab'),
        (1, 3, u'bc'),
        (1, 2, u'b'),
        (2, 2, u''),
        (1000, 1000, u''),
        (2000, 1000, u''),
        (2, 1, u''),
    )
    for start, stop, expected in slice_cases:
        self.checkmethod('__getslice__', u'abc', expected, start, stop)
    # FIXME What about negative indices? This is handled differently by [] and __getslice__
Barry Warsaw817918c2002-08-06 16:58:21 +00001056
def test_main():
    """Collect the UnicodeTest cases into a suite and run it via test_support."""
    unicode_tests = unittest.makeSuite(UnicodeTest)
    test.test_support.run_suite(unittest.TestSuite([unicode_tests]))
Barry Warsaw817918c2002-08-06 16:58:21 +00001061
# Run the tests when this file is executed directly as a script.
if '__main__' == __name__:
    test_main()