blob: 130a418dc5ee78a39b5da80146638eaf3e7d4911 [file] [log] [blame]
Martin v. Löwisa729daf2002-08-04 17:28:33 +00001# -*- coding: iso-8859-1 -*-
Guido van Rossuma831cac2000-03-10 23:23:21 +00002""" Test script for the Unicode implementation.
3
Guido van Rossuma831cac2000-03-10 23:23:21 +00004Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
Marc-André Lemburg36619082001-01-17 19:11:13 +00008"""#"
Walter Dörwald28256f22003-01-19 16:59:20 +00009import unittest, test.test_support
10import sys, string, codecs
Guido van Rossuma831cac2000-03-10 23:23:21 +000011
Walter Dörwald28256f22003-01-19 16:59:20 +000012class UnicodeTest(unittest.TestCase):
Guido van Rossume4874ae2001-09-21 15:36:41 +000013
Walter Dörwald28256f22003-01-19 16:59:20 +000014 def test_repr(self):
15 if not sys.platform.startswith('java'):
16 # Test basic sanity of repr()
17 self.assertEqual(repr(u'abc'), "u'abc'")
18 self.assertEqual(repr(u'ab\\c'), "u'ab\\\\c'")
19 self.assertEqual(repr(u'ab\\'), "u'ab\\\\'")
20 self.assertEqual(repr(u'\\c'), "u'\\\\c'")
21 self.assertEqual(repr(u'\\'), "u'\\\\'")
22 self.assertEqual(repr(u'\n'), "u'\\n'")
23 self.assertEqual(repr(u'\r'), "u'\\r'")
24 self.assertEqual(repr(u'\t'), "u'\\t'")
25 self.assertEqual(repr(u'\b'), "u'\\x08'")
26 self.assertEqual(repr(u"'\""), """u'\\'"'""")
27 self.assertEqual(repr(u"'\""), """u'\\'"'""")
28 self.assertEqual(repr(u"'"), '''u"'"''')
29 self.assertEqual(repr(u'"'), """u'"'""")
30 latin1repr = (
31 "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
32 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
33 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
34 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
35 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
36 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
37 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
38 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
39 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
40 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
41 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
42 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
43 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
44 "\\xfe\\xff'")
45 testrepr = repr(u''.join(map(unichr, xrange(256))))
46 self.assertEqual(testrepr, latin1repr)
47
48 def checkmethod(self, method, input, output, *args):
Guido van Rossuma831cac2000-03-10 23:23:21 +000049 f = getattr(input, method)
Walter Dörwald28256f22003-01-19 16:59:20 +000050 value = f(*args)
51 self.assertEqual(output, value)
52 self.assert_(type(output) is type(value))
53
Walter Dörwald2ee4be02002-04-17 21:34:05 +000054 # if the original is returned make sure that
55 # this doesn't happen with subclasses
56 if value is input:
57 class usub(unicode):
58 def __repr__(self):
59 return 'usub(%r)' % unicode.__repr__(self)
60 input = usub(input)
Walter Dörwald28256f22003-01-19 16:59:20 +000061 f = getattr(input, method)
62 value = f(*args)
63 self.assertEqual(output, value)
64 self.assert_(input is not value)
Guido van Rossuma831cac2000-03-10 23:23:21 +000065
Walter Dörwald28256f22003-01-19 16:59:20 +000066 def test_capitalize(self):
67 self.checkmethod('capitalize', u' hello ', u' hello ')
68 self.checkmethod('capitalize', u'Hello ', u'Hello ')
69 self.checkmethod('capitalize', u'hello ', u'Hello ')
70 self.checkmethod('capitalize', u'aaaa', u'Aaaa')
71 self.checkmethod('capitalize', u'AaAa', u'Aaaa')
Guido van Rossuma831cac2000-03-10 23:23:21 +000072
Walter Dörwald28256f22003-01-19 16:59:20 +000073 self.assertRaises(TypeError, u'hello'.capitalize, 42)
Marc-André Lemburg3a645e42001-01-16 11:54:12 +000074
Walter Dörwald28256f22003-01-19 16:59:20 +000075 def test_count(self):
76 self.checkmethod('count', u'aaa', 3, u'a')
77 self.checkmethod('count', u'aaa', 0, u'b')
78 self.checkmethod('count', 'aaa', 3, u'a')
79 self.checkmethod('count', 'aaa', 0, u'b')
80 self.checkmethod('count', u'aaa', 3, 'a')
81 self.checkmethod('count', u'aaa', 0, 'b')
Guido van Rossuma831cac2000-03-10 23:23:21 +000082
Walter Dörwald28256f22003-01-19 16:59:20 +000083 self.assertRaises(TypeError, u'hello'.count)
Guido van Rossuma831cac2000-03-10 23:23:21 +000084
Walter Dörwald28256f22003-01-19 16:59:20 +000085 def test_title(self):
86 self.checkmethod('title', u' hello ', u' Hello ')
87 self.checkmethod('title', u'Hello ', u'Hello ')
88 self.checkmethod('title', u'hello ', u'Hello ')
89 self.checkmethod('title', u"fOrMaT thIs aS titLe String", u'Format This As Title String')
90 self.checkmethod('title', u"fOrMaT,thIs-aS*titLe;String", u'Format,This-As*Title;String')
91 self.checkmethod('title', u"getInt", u'Getint')
Guido van Rossuma831cac2000-03-10 23:23:21 +000092
Walter Dörwald28256f22003-01-19 16:59:20 +000093 self.assertRaises(TypeError, u'hello'.count, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +000094
Walter Dörwald28256f22003-01-19 16:59:20 +000095 def test_find(self):
96 self.checkmethod('find', u'abcdefghiabc', 0, u'abc')
97 self.checkmethod('find', u'abcdefghiabc', 9, u'abc', 1)
98 self.checkmethod('find', u'abcdefghiabc', -1, u'def', 4)
Guido van Rossuma831cac2000-03-10 23:23:21 +000099
Walter Dörwald28256f22003-01-19 16:59:20 +0000100 self.assertRaises(TypeError, u'hello'.find)
101 self.assertRaises(TypeError, u'hello'.find, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000102
Walter Dörwald28256f22003-01-19 16:59:20 +0000103 def test_rfind(self):
104 self.checkmethod('rfind', u'abcdefghiabc', 9, u'abc')
105 self.checkmethod('rfind', 'abcdefghiabc', 9, u'abc')
106 self.checkmethod('rfind', 'abcdefghiabc', 12, u'')
107 self.checkmethod('rfind', u'abcdefghiabc', 12, '')
108 self.checkmethod('rfind', u'abcdefghiabc', 12, u'')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000109
Walter Dörwald28256f22003-01-19 16:59:20 +0000110 self.assertRaises(TypeError, u'hello'.rfind)
111 self.assertRaises(TypeError, u'hello'.rfind, 42)
Guido van Rossum8b264542000-12-19 02:22:31 +0000112
Walter Dörwald28256f22003-01-19 16:59:20 +0000113 def test_index(self):
114 self.checkmethod('index', u'abcdefghiabc', 0, u'')
115 self.checkmethod('index', u'abcdefghiabc', 3, u'def')
116 self.checkmethod('index', u'abcdefghiabc', 0, u'abc')
117 self.checkmethod('index', u'abcdefghiabc', 9, u'abc', 1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000118
Walter Dörwald28256f22003-01-19 16:59:20 +0000119 self.assertRaises(ValueError, u'abcdefghiabc'.index, u'hib')
120 self.assertRaises(ValueError, u'abcdefghiab'.index, u'abc', 1)
121 self.assertRaises(ValueError, u'abcdefghi'.index, u'ghi', 8)
122 self.assertRaises(ValueError, u'abcdefghi'.index, u'ghi', -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000123
Walter Dörwald28256f22003-01-19 16:59:20 +0000124 self.assertRaises(TypeError, u'hello'.index)
125 self.assertRaises(TypeError, u'hello'.index, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000126
Walter Dörwald28256f22003-01-19 16:59:20 +0000127 def test_rindex(self):
128 self.checkmethod('rindex', u'abcdefghiabc', 12, u'')
129 self.checkmethod('rindex', u'abcdefghiabc', 3, u'def')
130 self.checkmethod('rindex', u'abcdefghiabc', 9, u'abc')
131 self.checkmethod('rindex', u'abcdefghiabc', 0, u'abc', 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000132
Walter Dörwald28256f22003-01-19 16:59:20 +0000133 self.assertRaises(ValueError, u'abcdefghiabc'.rindex, u'hib')
134 self.assertRaises(ValueError, u'defghiabc'.rindex, u'def', 1)
135 self.assertRaises(ValueError, u'defghiabc'.rindex, u'abc', 0, -1)
136 self.assertRaises(ValueError, u'abcdefghi'.rindex, u'ghi', 0, 8)
137 self.assertRaises(ValueError, u'abcdefghi'.rindex, u'ghi', 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000138
Walter Dörwald28256f22003-01-19 16:59:20 +0000139 self.assertRaises(TypeError, u'hello'.rindex)
140 self.assertRaises(TypeError, u'hello'.rindex, 42)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +0000141
Walter Dörwald28256f22003-01-19 16:59:20 +0000142 def test_lower(self):
143 self.checkmethod('lower', u'HeLLo', u'hello')
144 self.checkmethod('lower', u'hello', u'hello')
Walter Dörwaldde02bcb2002-04-22 17:42:37 +0000145
Walter Dörwald28256f22003-01-19 16:59:20 +0000146 self.assertRaises(TypeError, u"hello".lower, 42)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +0000147
Walter Dörwald28256f22003-01-19 16:59:20 +0000148 def test_upper(self):
149 self.checkmethod('upper', u'HeLLo', u'HELLO')
150 self.checkmethod('upper', u'HELLO', u'HELLO')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000151
Walter Dörwald28256f22003-01-19 16:59:20 +0000152 self.assertRaises(TypeError, u'hello'.upper, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000153
    def test_translate(self):
        # Checks unicode.translate() with dict-based mapping tables.
        #
        # NOTE(review): the ``if 0:`` block below is disabled legacy code
        # that exercises str-style 256-character translation tables.  Its
        # extent is reconstructed here; the string.maketrans() based checks
        # are presumed to belong to it -- confirm against the pristine file.
        if 0:
            transtable = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`xyzdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377'

            self.checkmethod('maketrans', u'abc', transtable, u'xyz')
            self.checkmethod('maketrans', u'abc', ValueError, u'xyzq')

            self.checkmethod('translate', u'xyzabcdef', u'xyzxyz', transtable, u'def')

            table = string.maketrans('a', u'A')
            self.checkmethod('translate', u'abc', u'Abc', table)
            self.checkmethod('translate', u'xyz', u'xyz', table)

        # Mapping values: None deletes the character, an integer is a
        # replacement ordinal, a unicode string (possibly empty or longer
        # than one character) is inserted as-is.
        self.checkmethod('translate', u"abababc", u'bbbc', {ord('a'):None})
        self.checkmethod('translate', u"abababc", u'iiic', {ord('a'):None, ord('b'):ord('i')})
        self.checkmethod('translate', u"abababc", u'iiix', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})
        self.checkmethod('translate', u"abababc", u'<i><i><i>c', {ord('a'):None, ord('b'):u'<i>'})
        self.checkmethod('translate', u"abababc", u'c', {ord('a'):None, ord('b'):u''})

        # The mapping argument is mandatory.
        self.assertRaises(TypeError, u'hello'.translate)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000174
Walter Dörwald28256f22003-01-19 16:59:20 +0000175 def test_split(self):
176 self.checkmethod(
177 'split',
178 u'this is the split function',
179 [u'this', u'is', u'the', u'split', u'function']
180 )
181 self.checkmethod('split', u'a|b|c|d', [u'a', u'b', u'c', u'd'], u'|')
182 self.checkmethod('split', u'a|b|c|d', [u'a', u'b', u'c|d'], u'|', 2)
183 self.checkmethod('split', u'a b c d', [u'a', u'b c d'], None, 1)
184 self.checkmethod('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
185 self.checkmethod('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 3)
186 self.checkmethod('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 4)
187 self.checkmethod('split', u'a b c d', [u'a b c d'], None, 0)
188 self.checkmethod('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
189 self.checkmethod('split', u'a b c d ', [u'a', u'b', u'c', u'd'])
190 self.checkmethod('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
191 self.checkmethod('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], '//')
192 self.checkmethod('split', 'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
193 self.checkmethod('split', u'endcase test', [u'endcase ', u''], u'test')
194 self.checkmethod('split', u'endcase test', [u'endcase ', u''], 'test')
195 self.checkmethod('split', 'endcase test', [u'endcase ', u''], u'test')
Andrew M. Kuchlingeddd68d2002-03-29 16:21:44 +0000196
Walter Dörwald28256f22003-01-19 16:59:20 +0000197 self.assertRaises(TypeError, u"hello".split, 42, 42, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000198
Walter Dörwald28256f22003-01-19 16:59:20 +0000199 def test_join(self):
200 # join now works with any sequence type
201 class Sequence:
202 def __init__(self, seq): self.seq = seq
203 def __len__(self): return len(self.seq)
204 def __getitem__(self, i): return self.seq[i]
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000205
Walter Dörwald28256f22003-01-19 16:59:20 +0000206 self.checkmethod('join', u' ', u'a b c d', [u'a', u'b', u'c', u'd'])
207 self.checkmethod('join', u' ', u'a b c d', ['a', 'b', u'c', u'd'])
208 self.checkmethod('join', u'', u'abcd', (u'a', u'b', u'c', u'd'))
209 self.checkmethod('join', u' ', u'w x y z', Sequence('wxyz'))
210 self.assertRaises(TypeError, u' '.join, 7)
211 self.assertRaises(TypeError, u' '.join, Sequence([7, u'hello', 123L]))
212 self.checkmethod('join', ' ', u'a b c d', [u'a', u'b', u'c', u'd'])
213 self.checkmethod('join', ' ', u'a b c d', ['a', 'b', u'c', u'd'])
214 self.checkmethod('join', '', u'abcd', (u'a', u'b', u'c', u'd'))
215 self.checkmethod('join', ' ', u'w x y z', Sequence(u'wxyz'))
216 self.assertRaises(TypeError, ' '.join, TypeError)
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000217
Walter Dörwald28256f22003-01-19 16:59:20 +0000218 result = u''
219 for i in range(10):
220 if i > 0:
221 result = result + u':'
222 result = result + u'x'*10
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000223
Walter Dörwald28256f22003-01-19 16:59:20 +0000224 self.checkmethod('join', u':', result, [u'x' * 10] * 10)
225 self.checkmethod('join', u':', result, (u'x' * 10,) * 10)
Marc-André Lemburge5034372000-08-08 08:04:29 +0000226
Walter Dörwald28256f22003-01-19 16:59:20 +0000227 self.assertRaises(TypeError, u"hello".join)
Marc-André Lemburge5034372000-08-08 08:04:29 +0000228
Walter Dörwald28256f22003-01-19 16:59:20 +0000229 def test_strip(self):
230 self.checkmethod('strip', u' hello ', u'hello')
231 self.checkmethod('lstrip', u' hello ', u'hello ')
232 self.checkmethod('rstrip', u' hello ', u' hello')
233 self.checkmethod('strip', u'hello', u'hello')
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000234
Walter Dörwald28256f22003-01-19 16:59:20 +0000235 # strip/lstrip/rstrip with None arg
236 self.checkmethod('strip', u' hello ', u'hello', None)
237 self.checkmethod('lstrip', u' hello ', u'hello ', None)
238 self.checkmethod('rstrip', u' hello ', u' hello', None)
239 self.checkmethod('strip', u'hello', u'hello', None)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000240
Walter Dörwald28256f22003-01-19 16:59:20 +0000241 # strip/lstrip/rstrip with unicode arg
242 self.checkmethod('strip', u'xyzzyhelloxyzzy', u'hello', u'xyz')
243 self.checkmethod('lstrip', u'xyzzyhelloxyzzy', u'helloxyzzy', u'xyz')
244 self.checkmethod('rstrip', u'xyzzyhelloxyzzy', u'xyzzyhello', u'xyz')
245 self.checkmethod('strip', u'hello', u'hello', u'xyz')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000246
Walter Dörwald28256f22003-01-19 16:59:20 +0000247 # strip/lstrip/rstrip with str arg
248 self.checkmethod('strip', u'xyzzyhelloxyzzy', u'hello', 'xyz')
249 self.checkmethod('lstrip', u'xyzzyhelloxyzzy', u'helloxyzzy', 'xyz')
250 self.checkmethod('rstrip', u'xyzzyhelloxyzzy', u'xyzzyhello', 'xyz')
251 self.checkmethod('strip', u'hello', u'hello', 'xyz')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000252
Walter Dörwald28256f22003-01-19 16:59:20 +0000253 self.assertRaises(TypeError, u"hello".strip, 42, 42)
254 self.assertRaises(UnicodeError, u"hello".strip, "\xff")
Guido van Rossuma831cac2000-03-10 23:23:21 +0000255
Walter Dörwald28256f22003-01-19 16:59:20 +0000256 def test_swapcase(self):
257 self.checkmethod('swapcase', u'HeLLo cOmpUteRs', u'hEllO CoMPuTErS')
Marc-André Lemburg9d467412000-07-05 09:46:40 +0000258
Walter Dörwald28256f22003-01-19 16:59:20 +0000259 self.assertRaises(TypeError, u"hello".swapcase, 42)
Marc-André Lemburg9d467412000-07-05 09:46:40 +0000260
Walter Dörwald28256f22003-01-19 16:59:20 +0000261 def test_replace(self):
262 self.checkmethod('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1)
263 self.checkmethod('replace', u'one!two!three!', u'onetwothree', '!', '')
264 self.checkmethod('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2)
265 self.checkmethod('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3)
266 self.checkmethod('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4)
267 self.checkmethod('replace', u'one!two!three!', u'one!two!three!', u'!', u'@', 0)
268 self.checkmethod('replace', u'one!two!three!', u'one@two@three@', u'!', u'@')
269 self.checkmethod('replace', u'one!two!three!', u'one!two!three!', u'x', u'@')
270 self.checkmethod('replace', u'one!two!three!', u'one!two!three!', u'x', u'@', 2)
271 self.checkmethod('replace', u'abc', u'-a-b-c-', u'', u'-')
272 self.checkmethod('replace', u'abc', u'-a-b-c', u'', u'-', 3)
273 self.checkmethod('replace', u'abc', u'abc', u'', u'-', 0)
274 self.checkmethod('replace', u'abc', u'abc', u'ab', u'--', 0)
275 self.checkmethod('replace', u'abc', u'abc', u'xy', u'--')
276 self.checkmethod('replace', u'', u'', u'', u'')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000277
Walter Dörwald28256f22003-01-19 16:59:20 +0000278 # method call forwarded from str implementation because of unicode argument
279 self.checkmethod('replace', 'one!two!three!', u'one@two!three!', u'!', u'@', 1)
280 self.assertRaises(TypeError, 'replace'.replace, 42)
281 self.assertRaises(TypeError, 'replace'.replace, u"r", 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000282
Walter Dörwald28256f22003-01-19 16:59:20 +0000283 self.assertRaises(TypeError, u"hello".replace)
284 self.assertRaises(TypeError, u"hello".replace, 42, u"h")
285 self.assertRaises(TypeError, u"hello".replace, u"h", 42)
Guido van Rossumd4d26842000-03-13 23:21:48 +0000286
Walter Dörwald28256f22003-01-19 16:59:20 +0000287 def test_startswith(self):
288 self.checkmethod('startswith', u'hello', True, u'he')
289 self.checkmethod('startswith', u'hello', True, u'hello')
290 self.checkmethod('startswith', u'hello', False, u'hello world')
291 self.checkmethod('startswith', u'hello', True, u'')
292 self.checkmethod('startswith', u'hello', False, u'ello')
293 self.checkmethod('startswith', u'hello', True, u'ello', 1)
294 self.checkmethod('startswith', u'hello', True, u'o', 4)
295 self.checkmethod('startswith', u'hello', False, u'o', 5)
296 self.checkmethod('startswith', u'hello', True, u'', 5)
297 self.checkmethod('startswith', u'hello', False, u'lo', 6)
298 self.checkmethod('startswith', u'helloworld', True, u'lowo', 3)
299 self.checkmethod('startswith', u'helloworld', True, u'lowo', 3, 7)
300 self.checkmethod('startswith', u'helloworld', False, u'lowo', 3, 6)
Marc-André Lemburg84625732000-06-13 12:05:36 +0000301
Walter Dörwald28256f22003-01-19 16:59:20 +0000302 self.assertRaises(TypeError, u"hello".startswith)
303 self.assertRaises(TypeError, u"hello".startswith, 42)
Marc-André Lemburg84625732000-06-13 12:05:36 +0000304
Walter Dörwald28256f22003-01-19 16:59:20 +0000305 def test_endswith(self):
306 self.checkmethod('endswith', u'hello', True, u'lo')
307 self.checkmethod('endswith', u'hello', False, u'he')
308 self.checkmethod('endswith', u'hello', True, u'')
309 self.checkmethod('endswith', u'hello', False, u'hello world')
310 self.checkmethod('endswith', u'helloworld', False, u'worl')
311 self.checkmethod('endswith', u'helloworld', True, u'worl', 3, 9)
312 self.checkmethod('endswith', u'helloworld', True, u'world', 3, 12)
313 self.checkmethod('endswith', u'helloworld', True, u'lowo', 1, 7)
314 self.checkmethod('endswith', u'helloworld', True, u'lowo', 2, 7)
315 self.checkmethod('endswith', u'helloworld', True, u'lowo', 3, 7)
316 self.checkmethod('endswith', u'helloworld', False, u'lowo', 4, 7)
317 self.checkmethod('endswith', u'helloworld', False, u'lowo', 3, 8)
318 self.checkmethod('endswith', u'ab', False, u'ab', 0, 1)
319 self.checkmethod('endswith', u'ab', False, u'ab', 0, 0)
320 self.checkmethod('endswith', 'helloworld', True, u'd')
321 self.checkmethod('endswith', 'helloworld', False, u'l')
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000322
Walter Dörwald28256f22003-01-19 16:59:20 +0000323 self.assertRaises(TypeError, u"hello".endswith)
324 self.assertRaises(TypeError, u"hello".endswith, 42)
325
326 def test_expandtabs(self):
327 self.checkmethod('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab def\ng hi')
328 self.checkmethod('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab def\ng hi', 8)
329 self.checkmethod('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab def\ng hi', 4)
330 self.checkmethod('expandtabs', u'abc\r\nab\tdef\ng\thi', u'abc\r\nab def\ng hi', 4)
331 self.checkmethod('expandtabs', u'abc\r\nab\r\ndef\ng\r\nhi', u'abc\r\nab\r\ndef\ng\r\nhi', 4)
332
333 self.assertRaises(TypeError, u"hello".expandtabs, 42, 42)
334
335 def test_capwords(self):
336 if 0:
337 self.checkmethod('capwords', u'abc def ghi', u'Abc Def Ghi')
338 self.checkmethod('capwords', u'abc\tdef\nghi', u'Abc Def Ghi')
339 self.checkmethod('capwords', u'abc\t def \nghi', u'Abc Def Ghi')
340
341 def test_zfill(self):
342 self.checkmethod('zfill', u'123', u'123', 2)
343 self.checkmethod('zfill', u'123', u'123', 3)
344 self.checkmethod('zfill', u'123', u'0123', 4)
345 self.checkmethod('zfill', u'+123', u'+123', 3)
346 self.checkmethod('zfill', u'+123', u'+123', 4)
347 self.checkmethod('zfill', u'+123', u'+0123', 5)
348 self.checkmethod('zfill', u'-123', u'-123', 3)
349 self.checkmethod('zfill', u'-123', u'-123', 4)
350 self.checkmethod('zfill', u'-123', u'-0123', 5)
351 self.checkmethod('zfill', u'', u'000', 3)
352 self.checkmethod('zfill', u'34', u'34', 1)
353 self.checkmethod('zfill', u'34', u'00034', 5)
354
355 self.assertRaises(TypeError, u"123".zfill)
356
357 def test_comparison(self):
358 # Comparisons:
359 self.assertEqual(u'abc', 'abc')
360 self.assertEqual('abc', u'abc')
361 self.assertEqual(u'abc', u'abc')
362 self.assert_(u'abcd' > 'abc')
363 self.assert_('abcd' > u'abc')
364 self.assert_(u'abcd' > u'abc')
365 self.assert_(u'abc' < 'abcd')
366 self.assert_('abc' < u'abcd')
367 self.assert_(u'abc' < u'abcd')
368
369 if 0:
370 # Move these tests to a Unicode collation module test...
371 # Testing UTF-16 code point order comparisons...
372
373 # No surrogates, no fixup required.
374 self.assert_(u'\u0061' < u'\u20ac')
375 # Non surrogate below surrogate value, no fixup required
376 self.assert_(u'\u0061' < u'\ud800\udc02')
377
378 # Non surrogate above surrogate value, fixup required
379 def test_lecmp(s, s2):
380 self.assert_(s < s2)
381
382 def test_fixup(s):
383 s2 = u'\ud800\udc01'
384 test_lecmp(s, s2)
385 s2 = u'\ud900\udc01'
386 test_lecmp(s, s2)
387 s2 = u'\uda00\udc01'
388 test_lecmp(s, s2)
389 s2 = u'\udb00\udc01'
390 test_lecmp(s, s2)
391 s2 = u'\ud800\udd01'
392 test_lecmp(s, s2)
393 s2 = u'\ud900\udd01'
394 test_lecmp(s, s2)
395 s2 = u'\uda00\udd01'
396 test_lecmp(s, s2)
397 s2 = u'\udb00\udd01'
398 test_lecmp(s, s2)
399 s2 = u'\ud800\ude01'
400 test_lecmp(s, s2)
401 s2 = u'\ud900\ude01'
402 test_lecmp(s, s2)
403 s2 = u'\uda00\ude01'
404 test_lecmp(s, s2)
405 s2 = u'\udb00\ude01'
406 test_lecmp(s, s2)
407 s2 = u'\ud800\udfff'
408 test_lecmp(s, s2)
409 s2 = u'\ud900\udfff'
410 test_lecmp(s, s2)
411 s2 = u'\uda00\udfff'
412 test_lecmp(s, s2)
413 s2 = u'\udb00\udfff'
414 test_lecmp(s, s2)
415
416 test_fixup(u'\ue000')
417 test_fixup(u'\uff61')
418
419 # Surrogates on both sides, no fixup required
420 self.assert_(u'\ud800\udc02' < u'\ud84d\udc56')
421
422 def test_ljust(self):
423 self.checkmethod('ljust', u'abc', u'abc ', 10)
424 self.checkmethod('ljust', u'abc', u'abc ', 6)
425 self.checkmethod('ljust', u'abc', u'abc', 2)
426
427 self.assertRaises(TypeError, u"abc".ljust)
428
429 def test_rjust(self):
430 self.checkmethod('rjust', u'abc', u' abc', 10)
431 self.checkmethod('rjust', u'abc', u' abc', 6)
432 self.checkmethod('rjust', u'abc', u'abc', 2)
433
434 self.assertRaises(TypeError, u"abc".rjust)
435
436 def test_center(self):
437 self.checkmethod('center', u'abc', u' abc ', 10)
438 self.checkmethod('center', u'abc', u' abc ', 6)
439 self.checkmethod('center', u'abc', u'abc', 2)
440
441 self.assertRaises(TypeError, u"abc".center)
442
443 def test_islower(self):
444 self.checkmethod('islower', u'', False)
445 self.checkmethod('islower', u'a', True)
446 self.checkmethod('islower', u'A', False)
447 self.checkmethod('islower', u'\n', False)
448 self.checkmethod('islower', u'\u1FFc', False)
449 self.checkmethod('islower', u'abc', True)
450 self.checkmethod('islower', u'aBc', False)
451 self.checkmethod('islower', u'abc\n', True)
452
453 self.assertRaises(TypeError, u"abc".islower, 42)
454
455 def test_isupper(self):
456 self.checkmethod('isupper', u'', False)
457 self.checkmethod('isupper', u'a', False)
458 self.checkmethod('isupper', u'A', True)
459 self.checkmethod('isupper', u'\n', False)
460 if sys.platform[:4] != 'java':
461 self.checkmethod('isupper', u'\u1FFc', False)
462 self.checkmethod('isupper', u'ABC', True)
463 self.checkmethod('isupper', u'AbC', False)
464 self.checkmethod('isupper', u'ABC\n', True)
465
466 self.assertRaises(TypeError, u"abc".isupper, 42)
467
468 def test_istitle(self):
469 self.checkmethod('istitle', u'', False)
470 self.checkmethod('istitle', u'a', False)
471 self.checkmethod('istitle', u'A', True)
472 self.checkmethod('istitle', u'\n', False)
473 self.checkmethod('istitle', u'\u1FFc', True)
474 self.checkmethod('istitle', u'A Titlecased Line', True)
475 self.checkmethod('istitle', u'A\nTitlecased Line', True)
476 self.checkmethod('istitle', u'A Titlecased, Line', True)
477 self.checkmethod('istitle', u'Greek \u1FFcitlecases ...', True)
478 self.checkmethod('istitle', u'Not a capitalized String', False)
479 self.checkmethod('istitle', u'Not\ta Titlecase String', False)
480 self.checkmethod('istitle', u'Not--a Titlecase String', False)
481 self.checkmethod('istitle', u'NOT', False)
482
483 self.assertRaises(TypeError, u"abc".istitle, 42)
484
485 def test_isspace(self):
486 self.checkmethod('isspace', u'', False)
487 self.checkmethod('isspace', u'a', False)
488 self.checkmethod('isspace', u' ', True)
489 self.checkmethod('isspace', u'\t', True)
490 self.checkmethod('isspace', u'\r', True)
491 self.checkmethod('isspace', u'\n', True)
492 self.checkmethod('isspace', u' \t\r\n', True)
493 self.checkmethod('isspace', u' \t\r\na', False)
494
495 self.assertRaises(TypeError, u"abc".isspace, 42)
496
497 def test_isalpha(self):
498 self.checkmethod('isalpha', u'', False)
499 self.checkmethod('isalpha', u'a', True)
500 self.checkmethod('isalpha', u'A', True)
501 self.checkmethod('isalpha', u'\n', False)
502 self.checkmethod('isalpha', u'\u1FFc', True)
503 self.checkmethod('isalpha', u'abc', True)
504 self.checkmethod('isalpha', u'aBc123', False)
505 self.checkmethod('isalpha', u'abc\n', False)
506
507 self.assertRaises(TypeError, u"abc".isalpha, 42)
508
509 def test_isalnum(self):
510 self.checkmethod('isalnum', u'', False)
511 self.checkmethod('isalnum', u'a', True)
512 self.checkmethod('isalnum', u'A', True)
513 self.checkmethod('isalnum', u'\n', False)
514 self.checkmethod('isalnum', u'123abc456', True)
515 self.checkmethod('isalnum', u'a1b3c', True)
516 self.checkmethod('isalnum', u'aBc000 ', False)
517 self.checkmethod('isalnum', u'abc\n', False)
518
519 self.assertRaises(TypeError, u"abc".isalnum, 42)
520
521 def test_isdecimal(self):
522 self.checkmethod('isdecimal', u'', False)
523 self.checkmethod('isdecimal', u'a', False)
524 self.checkmethod('isdecimal', u'0', True)
525 self.checkmethod('isdecimal', u'\u2460', False) # CIRCLED DIGIT ONE
526 self.checkmethod('isdecimal', u'\xbc', False) # VULGAR FRACTION ONE QUARTER
527 self.checkmethod('isdecimal', u'\u0660', True) # ARABIC-INDIC DIGIT ZERO
528 self.checkmethod('isdecimal', u'0123456789', True)
529 self.checkmethod('isdecimal', u'0123456789a', False)
530
531 self.assertRaises(TypeError, u"abc".isdecimal, 42)
532
533 def test_isdigit(self):
534 self.checkmethod('isdigit', u'', False)
535 self.checkmethod('isdigit', u'a', False)
536 self.checkmethod('isdigit', u'0', True)
537 self.checkmethod('isdigit', u'\u2460', True)
538 self.checkmethod('isdigit', u'\xbc', False)
539 self.checkmethod('isdigit', u'\u0660', True)
540 self.checkmethod('isdigit', u'0123456789', True)
541 self.checkmethod('isdigit', u'0123456789a', False)
542
543 self.assertRaises(TypeError, u"abc".isdigit, 42)
544
545 def test_isnumeric(self):
546 self.checkmethod('isnumeric', u'', False)
547 self.checkmethod('isnumeric', u'a', False)
548 self.checkmethod('isnumeric', u'0', True)
549 self.checkmethod('isnumeric', u'\u2460', True)
550 self.checkmethod('isnumeric', u'\xbc', True)
551 self.checkmethod('isnumeric', u'\u0660', True)
552 self.checkmethod('isnumeric', u'0123456789', True)
553 self.checkmethod('isnumeric', u'0123456789a', False)
554
555 self.assertRaises(TypeError, u"abc".isnumeric, 42)
556
557 def test_splitlines(self):
558 self.checkmethod('splitlines', u"abc\ndef\n\rghi", [u'abc', u'def', u'', u'ghi'])
559 self.checkmethod('splitlines', u"abc\ndef\n\r\nghi", [u'abc', u'def', u'', u'ghi'])
560 self.checkmethod('splitlines', u"abc\ndef\r\nghi", [u'abc', u'def', u'ghi'])
561 self.checkmethod('splitlines', u"abc\ndef\r\nghi\n", [u'abc', u'def', u'ghi'])
562 self.checkmethod('splitlines', u"abc\ndef\r\nghi\n\r", [u'abc', u'def', u'ghi', u''])
563 self.checkmethod('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'', u'abc', u'def', u'ghi', u''])
564 self.checkmethod('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'\n', u'abc\n', u'def\r\n', u'ghi\n', u'\r'], True)
565
566 self.assertRaises(TypeError, u"abc".splitlines, 42, 42)
567
def test_contains(self):
    # Testing the Unicode contains method for every mix of str and
    # unicode operands.  Each row below is
    # (needle, haystack, whether ``needle in haystack`` must hold).
    first_batch = [
        ('a', u'abdb', True),
        ('a', u'bdab', True),
        ('a', u'bdaba', True),
        ('a', u'bdba', True),
        ('a', u'bdba', True),
        (u'a', u'bdba', True),
        (u'a', u'bdb', False),
        (u'a', 'bdb', False),
        (u'a', 'bdba', True),
        # membership in tuples holding mixed element types
        (u'a', ('a', 1, None), True),
        (u'a', (1, None, 'a'), True),
        (u'a', (1, None, u'a'), True),
        ('a', ('a', 1, None), True),
        ('a', (1, None, 'a'), True),
        ('a', (1, None, u'a'), True),
        ('a', ('x', 1, u'y'), False),
        ('a', ('x', 1, None), False),
        # multi-character needles
        (u'abcd', u'abcxxxx', False),
        (u'ab', u'abcd', True),
        ('ab', u'abc', True),
        (u'ab', 'abc', True),
        (u'ab', (1, None, u'ab'), True),
        (u'', u'abc', True),
        ('', u'abc', True),
    ]
    for needle, haystack, found in first_batch:
        self.assert_((needle in haystack) == found)

    # If the following fails either
    # the contains operator does not propagate UnicodeErrors or
    # someone has changed the default encoding
    self.assertRaises(UnicodeError, 'g\xe2teau'.__contains__, u'\xe2')

    second_batch = [
        # empty needle
        (u'', '', True),
        ('', u'', True),
        (u'', u'', True),
        (u'', 'abc', True),
        ('', u'abc', True),
        (u'', u'abc', True),
        # NUL character as needle
        (u'\0', 'abc', False),
        ('\0', u'abc', False),
        (u'\0', u'abc', False),
        (u'\0', '\0abc', True),
        ('\0', u'\0abc', True),
        (u'\0', u'\0abc', True),
        (u'\0', 'abc\0', True),
        ('\0', u'abc\0', True),
        (u'\0', u'abc\0', True),
        # ordinary needle in a haystack that starts with NUL
        (u'a', '\0abc', True),
        ('a', u'\0abc', True),
        (u'a', u'\0abc', True),
        # needle as long as, or longer than, the haystack
        (u'asdf', 'asdf', True),
        ('asdf', u'asdf', True),
        (u'asdf', u'asdf', True),
        (u'asdf', 'asd', False),
        ('asdf', u'asd', False),
        (u'asdf', u'asd', False),
        (u'asdf', '', False),
        ('asdf', u'', False),
        (u'asdf', u'', False),
    ]
    for needle, haystack, found in second_batch:
        self.assert_((needle in haystack) == found)

    # __contains__ needs exactly one argument
    self.assertRaises(TypeError, u"abc".__contains__)
629
def test_formatting(self):
    # Testing Unicode formatting strings: unicode format strings, str
    # format strings that are handed unicode arguments, and the errors
    # raised for malformed formats or mismatched arguments.
    self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
    # %5.2f pads its result to five columns, so values shorter than that
    # gain a leading blank in addition to the separator blank.
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57')
    self.assertEqual(u"%c" % (u"a",), u'a')
    self.assertEqual(u"%c" % ("a",), u'a')
    self.assertEqual(u"%c" % (34,), u'"')
    self.assertEqual(u"%c" % (36,), u'$')
    self.assertEqual(u"%d".__mod__(10), u'10')
    if not sys.platform.startswith('java'):
        self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
    self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
    self.assertEqual(u"%(x)s, %(ä)s" % {'x':u"abc", u'ä':"def"}, u'abc, def')

    # ordinals rejected by %c
    for ordinal in (-100, 0x200000):
        self.assertRaises(ValueError, u"%c".__mod__, ordinal)

    # float formatting
    for prec in xrange(100):
        fmt = u'%%.%if' % prec
        value = 0.01
        for x in xrange(60):
            value = value * 3.141592655 / 3.0 * 10.0
            # The formatfloat() code in stringobject.c and
            # unicodeobject.c uses a 120 byte buffer and switches from
            # 'f' formatting to 'g' at precision 50, so we expect
            # OverflowErrors for the ranges x < 50 and prec >= 67.
            if x < 50 and prec >= 67:
                self.assertRaises(OverflowError, fmt.__mod__, value)
            else:
                # only checks that formatting does not raise
                fmt % value

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % u"abc", u'...abc...')
    # '*' takes the field width (and precision) from the argument tuple;
    # the expected values carry the resulting padding blanks.
    self.assertEqual('%*s' % (5,u'abc',), u'  abc')
    self.assertEqual('%*s' % (-5,u'abc',), u'abc  ')
    self.assertEqual('%*.*s' % (5,2,u'abc',), u'   ab')
    self.assertEqual('%*.*s' % (5,3,u'abc',), u'  abc')
    self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5,3,u'abc',), u'103   abc')

    self.assertEqual(u'%3ld' % 42, u' 42')
    self.assertEqual(u'%07.2f' % 42, u'0042.00')

    # malformed formats and mismatched arguments
    self.assertRaises(TypeError, u"abc".__mod__)
    self.assertRaises(TypeError, u"%(foo)s".__mod__, 42)
    self.assertRaises(TypeError, u"%s%s".__mod__, (42,))
    self.assertRaises(TypeError, u"%c".__mod__, (None,))
    self.assertRaises(ValueError, u"%c".__mod__, (sys.maxunicode+1,))
    self.assertRaises(ValueError, u"%(foo".__mod__, {})
    self.assertRaises(TypeError, u"%(foo)s %(bar)s".__mod__, (u"foo", 42))

    # argument names with properly nested brackets are supported
    self.assertEqual(u"%((foo))s" % {u"(foo)": u"bar"}, u"bar")

    # 100 is a magic number in PyUnicode_Format, this forces a resize
    self.assertEqual(u"%sx" % (103*u"a"), 103*u"a"+u"x")

    self.assertRaises(TypeError, u"%*s".__mod__, (u"foo", u"bar"))
    self.assertRaises(TypeError, u"%10.*f".__mod__, (u"foo", 42.))
    self.assertRaises(ValueError, u"%10".__mod__, (42,))
703
704 def test_constructor(self):
705 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
706
707 self.assertEqual(
708 unicode(u'unicode remains unicode'),
709 u'unicode remains unicode'
710 )
711
712 class UnicodeSubclass(unicode):
Marc-André Lemburg79f57832002-12-29 19:44:06 +0000713 pass
Guido van Rossuma831cac2000-03-10 23:23:21 +0000714
Walter Dörwald28256f22003-01-19 16:59:20 +0000715 self.assertEqual(
716 unicode(UnicodeSubclass('unicode subclass becomes unicode')),
717 u'unicode subclass becomes unicode'
718 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000719
Walter Dörwald28256f22003-01-19 16:59:20 +0000720 self.assertEqual(
721 unicode('strings are converted to unicode'),
722 u'strings are converted to unicode'
723 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000724
Walter Dörwald28256f22003-01-19 16:59:20 +0000725 class UnicodeCompat:
726 def __init__(self, x):
727 self.x = x
728 def __unicode__(self):
729 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000730
Walter Dörwald28256f22003-01-19 16:59:20 +0000731 self.assertEqual(
732 unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
733 u'__unicode__ compatible objects are recognized')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000734
Walter Dörwald28256f22003-01-19 16:59:20 +0000735 class StringCompat:
736 def __init__(self, x):
737 self.x = x
738 def __str__(self):
739 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000740
Walter Dörwald28256f22003-01-19 16:59:20 +0000741 self.assertEqual(
742 unicode(StringCompat('__str__ compatible objects are recognized')),
743 u'__str__ compatible objects are recognized'
744 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000745
Walter Dörwald28256f22003-01-19 16:59:20 +0000746 # unicode(obj) is compatible to str():
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000747
Walter Dörwald28256f22003-01-19 16:59:20 +0000748 o = StringCompat('unicode(obj) is compatible to str()')
749 self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
750 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000751
Walter Dörwald28256f22003-01-19 16:59:20 +0000752 for obj in (123, 123.45, 123L):
753 self.assertEqual(unicode(obj), unicode(str(obj)))
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000754
Walter Dörwald28256f22003-01-19 16:59:20 +0000755 # unicode(obj, encoding, error) tests (this maps to
756 # PyUnicode_FromEncodedObject() at C level)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000757
Walter Dörwald28256f22003-01-19 16:59:20 +0000758 if not sys.platform.startswith('java'):
759 self.assertRaises(
760 TypeError,
761 unicode,
762 u'decoding unicode is not supported',
763 'utf-8',
764 'strict'
765 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000766
Walter Dörwald28256f22003-01-19 16:59:20 +0000767 self.assertEqual(
768 unicode('strings are decoded to unicode', 'utf-8', 'strict'),
769 u'strings are decoded to unicode'
770 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000771
Walter Dörwald28256f22003-01-19 16:59:20 +0000772 if not sys.platform.startswith('java'):
773 self.assertEqual(
774 unicode(
775 buffer('character buffers are decoded to unicode'),
776 'utf-8',
777 'strict'
778 ),
779 u'character buffers are decoded to unicode'
780 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000781
Walter Dörwald28256f22003-01-19 16:59:20 +0000782 self.assertRaises(TypeError, unicode, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000783
Walter Dörwald28256f22003-01-19 16:59:20 +0000784 def test_codecs_utf7(self):
785 utfTests = [
786 (u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
787 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
788 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
789 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
790 (u'+', '+-'),
791 (u'+-', '+--'),
792 (u'+?', '+-?'),
793 (u'\?', '+AFw?'),
794 (u'+?', '+-?'),
795 (ur'\\?', '+AFwAXA?'),
796 (ur'\\\?', '+AFwAXABc?'),
797 (ur'++--', '+-+---')
798 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000799
Walter Dörwald28256f22003-01-19 16:59:20 +0000800 for (x, y) in utfTests:
801 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000802
Walter Dörwald28256f22003-01-19 16:59:20 +0000803 # surrogates not supported
804 self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000805
Walter Dörwald28256f22003-01-19 16:59:20 +0000806 self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000807
def test_codecs_utf8(self):
    # Encoding: (unicode text, expected UTF-8 bytes).  The rows include
    # paired and lone surrogate code units.
    encode_cases = [
        (u'', ''),
        (u'\u20ac', '\xe2\x82\xac'),
        (u'\ud800\udc02', '\xf0\x90\x80\x82'),
        (u'\ud84d\udc56', '\xf0\xa3\x91\x96'),
        (u'\ud800', '\xed\xa0\x80'),
        (u'\udc00', '\xed\xb0\x80'),
        (u'\ud800\udc02'*1000, '\xf0\x90\x80\x82'*1000),
    ]
    for text, expected in encode_cases:
        self.assertEqual(text.encode('utf-8'), expected)

    # A longer text mixing multi-byte characters with ASCII.
    sentence = (
        u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
        u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
        u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
        u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
        u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
        u' Nunstuck git und'
    )
    sentence_utf8 = (
        '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
        '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
        '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
        '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
        '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
        '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
        '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
        '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
        '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
        '\xe3\x80\x8cWenn ist das Nunstuck git und'
    )
    self.assertEqual(sentence.encode('utf-8'), sentence_utf8)

    # UTF-8 specific decoding tests
    decode_cases = [
        ('\xf0\xa3\x91\x96', u'\U00023456'),
        ('\xf0\x90\x80\x82', u'\U00010002'),
        ('\xe2\x82\xac', u'\u20ac'),
    ]
    for raw, expected in decode_cases:
        self.assertEqual(unicode(raw, 'utf-8'), expected)

    # Other possible utf-8 test cases:
    # * strict decoding testing for all of the
    #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000846
def test_codecs_errors(self):
    eq = self.assertEqual
    raises = self.assertRaises

    # Error handling (encoding)
    text = u'Andr\202 x'
    raises(UnicodeError, text.encode, 'ascii')
    raises(UnicodeError, text.encode, 'ascii', 'strict')
    eq(text.encode('ascii', 'ignore'), "Andr x")
    eq(text.encode('ascii', 'replace'), "Andr? x")

    # Error handling (decoding)
    data = 'Andr\202 x'
    raises(UnicodeError, unicode, data, 'ascii')
    raises(UnicodeError, unicode, data, 'ascii', 'strict')
    eq(unicode(data, 'ascii', 'ignore'), u"Andr x")
    eq(unicode(data, 'ascii', 'replace'), u'Andr\uFFFD x')

    # Error handling (unknown character names)
    eq("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx")

    # Error handling (truncated escape sequence)
    raises(UnicodeError, "\\".decode, "unicode-escape")

    # Error handling (bad decoder return)
    def search_function(encoding):
        # Codec stubs whose return values have the wrong shape; each one
        # serves as both the encoder and the decoder of its codec.
        def returns_non_tuple(input, errors="strict"):
            return 42 # not a tuple
        def returns_non_unicode(input, errors="strict"):
            return (42, 42) # no unicode
        if encoding == "test.unicode1":
            return (returns_non_tuple, returns_non_tuple, None, None)
        if encoding == "test.unicode2":
            return (returns_non_unicode, returns_non_unicode, None, None)
        return None
    codecs.register(search_function)
    raises(TypeError, "hello".decode, "test.unicode1")
    raises(TypeError, unicode, "hello", "test.unicode2")
    raises(TypeError, u"hello".encode, "test.unicode1")
    raises(TypeError, u"hello".encode, "test.unicode2")
    # executes PyUnicode_Encode()
    import imp
    raises(ImportError, imp.find_module,
           "non-existing module", [u"non-existing dir"])

    # Error handling (wrong arguments)
    raises(TypeError, u"hello".encode, 42, 42, 42)

    # Error handling (PyUnicode_EncodeDecimal())
    raises(UnicodeError, int, u"\u0200")
Guido van Rossum97064862000-04-10 13:52:48 +0000901
def test_codecs(self):
    # Encoding
    for encoding, expected in [
        ('ascii', 'hello'),
        ('utf-7', 'hello'),
        ('utf-8', 'hello'),
        ('utf8', 'hello'),
        ('utf-16-le', 'h\000e\000l\000l\000o\000'),
        ('utf-16-be', '\000h\000e\000l\000l\000o'),
        ('latin-1', 'hello'),
    ]:
        self.assertEqual(u'hello'.encode(encoding), expected)

    def check_roundtrip(text, encodings):
        # Encoding and then decoding with the same codec must give
        # back the original text.
        for encoding in encodings:
            self.assertEqual(unicode(text.encode(encoding), encoding), text)

    # Roundtrip safety for BMP (just the first 1024 chars)
    check_roundtrip(
        u''.join([unichr(code) for code in xrange(1024)]),
        ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
         'raw_unicode_escape', 'unicode_escape', 'unicode_internal'))

    # Roundtrip safety for BMP (just the first 256 chars)
    check_roundtrip(
        u''.join([unichr(code) for code in xrange(256)]),
        ('latin-1',))

    # Roundtrip safety for BMP (just the first 128 chars)
    check_roundtrip(
        u''.join([unichr(code) for code in xrange(128)]),
        ('ascii',))

    # Roundtrip safety for non-BMP (just a few chars)
    check_roundtrip(
        u'\U00010001\U00020002\U00030003\U00040004\U00050005',
        ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
         #'raw_unicode_escape',
         'unicode_escape', 'unicode_internal'))

    # UTF-8 must be roundtrip safe for all UCS-2 code points
    # This excludes surrogates: in the full range, there would be
    # a surrogate pair (\udbff\udc00), which gets converted back
    # to a non-BMP character (\U0010fc00)
    non_surrogates = range(0, 0xd800) + range(0xe000, 0x10000)
    check_roundtrip(
        u''.join([unichr(code) for code in non_surrogates]),
        ('utf-8',))
Guido van Rossum9e896b32000-04-05 20:11:21 +0000942
def test_codecs_charmap(self):
    def check_roundtrip(data, encodings):
        # Decoding the byte string and encoding the result again with
        # the same charmap codec must reproduce the bytes.
        for encoding in encodings:
            self.assertEqual(unicode(data, encoding).encode(encoding), data)

    # 0-127
    low_half = ''.join([chr(code) for code in xrange(128)])
    low_half_codecs = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
        'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        'cp1256', 'cp1257', 'cp1258',
        'cp856', 'cp857', 'cp864', 'cp869', 'cp874',

        'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
        'cp1006', 'iso8859_8',

        ### These have undefined mappings:
        #'cp424',

        ### These fail the round-trip:
        #'cp875'
    )
    check_roundtrip(low_half, low_half_codecs)

    # 128-255
    high_half = ''.join([chr(code) for code in xrange(128, 256)])
    high_half_codecs = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_4', 'iso8859_5',
        'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        ### These have undefined mappings:
        #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        #'cp1256', 'cp1257', 'cp1258',
        #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
        #'iso8859_3', 'iso8859_6', 'iso8859_7',
        #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',

        ### These fail the round-trip:
        #'cp1006', 'cp875', 'iso8859_8',
    )
    check_roundtrip(high_half, high_half_codecs)
Guido van Rossum9e896b32000-04-05 20:11:21 +0000996
def test_concatenation(self):
    # Adjacent string literals are joined by the compiler; any unicode
    # literal among them makes the joined result unicode.
    joined_literals = [
        (u"abc" u"def"),
        ("abc" u"def"),
        (u"abc" "def"),
    ]
    for joined in joined_literals:
        self.assertEqual(joined, u"abcdef")

    three_part_literals = [
        (u"abc" u"def" "ghi"),
        ("abc" "def" u"ghi"),
    ]
    for joined in three_part_literals:
        self.assertEqual(joined, u"abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +00001003
def test_printing(self):
    # The print statement must accept unicode objects, alone or mixed
    # with str, with and without a trailing comma.  Output is thrown
    # away; the test only checks that nothing raises.
    class BitBucket:
        def write(self, text):
            pass

    sink = BitBucket()
    print >>sink, u'abc'
    print >>sink, u'abc', u'def'
    print >>sink, u'abc', 'def'
    print >>sink, 'abc', u'def'
    print >>sink, u'abc\n'
    # trailing commas suppress the newline print would otherwise add
    print >>sink, u'abc\n',
    print >>sink, u'abc\n',
    print >>sink, u'def\n'
    print >>sink, u'def\n'
Fred Drake004d5e62000-10-23 17:22:08 +00001019
def test_mul(self):
    # (expected result, repeat count) for u'abc'.__mul__
    for expected, count in [
        (u'', -1),
        (u'', 0),
        (u'abc', 1),
        (u'abcabcabc', 3),
    ]:
        self.checkmethod('__mul__', u'abc', expected, count)

    # a repeat count of sys.maxint must be refused
    big = 10000*u'abc'
    self.assertRaises(OverflowError, big.__mul__, sys.maxint)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001026
Walter Dörwald28256f22003-01-19 16:59:20 +00001027 def test_subscript(self):
1028 self.checkmethod('__getitem__', u'abc', u'a', 0)
1029 self.checkmethod('__getitem__', u'abc', u'c', -1)
1030 self.checkmethod('__getitem__', u'abc', u'a', 0L)
1031 self.checkmethod('__getitem__', u'abc', u'abc', slice(0, 3))
1032 self.checkmethod('__getitem__', u'abc', u'abc', slice(0, 1000))
1033 self.checkmethod('__getitem__', u'abc', u'a', slice(0, 1))
1034 self.checkmethod('__getitem__', u'abc', u'', slice(0, 0))
1035 # FIXME What about negative indizes? This is handled differently by [] and __getitem__(slice)
Fred Drakee0243e22000-04-13 14:11:56 +00001036
Walter Dörwald28256f22003-01-19 16:59:20 +00001037 self.assertRaises(TypeError, u"abc".__getitem__, "def")
Marc-André Lemburg0c4d8d02001-11-20 15:17:25 +00001038
def test_slice(self):
    # (expected result, start, stop) for u'abc'.__getslice__
    for expected, start, stop in [
        (u'abc', 0, 1000),
        (u'abc', 0, 3),
        (u'ab', 0, 2),
        (u'bc', 1, 3),
        (u'b', 1, 2),
        (u'', 2, 2),
        (u'', 1000, 1000),
        (u'', 2000, 1000),
        (u'', 2, 1),
    ]:
        self.checkmethod('__getslice__', u'abc', expected, start, stop)
    # FIXME What about negative indices? This is handled differently by [] and __getslice__
Barry Warsaw817918c2002-08-06 16:58:21 +00001050
def test_main():
    """Collect the UnicodeTest cases into a suite and run it via test_support."""
    cases = unittest.makeSuite(UnicodeTest)
    test.test_support.run_suite(unittest.TestSuite([cases]))


if __name__ == "__main__":
    test_main()