blob: c04d929e1911df430cfce5fccc35a5a09d0b98d8 [file] [log] [blame]
Guido van Rossuma831cac2000-03-10 23:23:21 +00001""" Test script for the Unicode implementation.
2
Guido van Rossuma831cac2000-03-10 23:23:21 +00003Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
Marc-André Lemburg36619082001-01-17 19:11:13 +00007"""#"
Tim Peters2f228e72001-05-13 00:19:31 +00008from test_support import verify, verbose, TestFailed
Andrew M. Kuchlingeddd68d2002-03-29 16:21:44 +00009import sys, string
Guido van Rossuma831cac2000-03-10 23:23:21 +000010
Finn Bock2b29cb22001-12-10 20:57:34 +000011if not sys.platform.startswith('java'):
12 # Test basic sanity of repr()
13 verify(repr(u'abc') == "u'abc'")
14 verify(repr(u'ab\\c') == "u'ab\\\\c'")
15 verify(repr(u'ab\\') == "u'ab\\\\'")
16 verify(repr(u'\\c') == "u'\\\\c'")
17 verify(repr(u'\\') == "u'\\\\'")
18 verify(repr(u'\n') == "u'\\n'")
19 verify(repr(u'\r') == "u'\\r'")
20 verify(repr(u'\t') == "u'\\t'")
21 verify(repr(u'\b') == "u'\\x08'")
22 verify(repr(u"'\"") == """u'\\'"'""")
23 verify(repr(u"'\"") == """u'\\'"'""")
24 verify(repr(u"'") == '''u"'"''')
25 verify(repr(u'"') == """u'"'""")
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +000026 latin1repr = (
27 "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
28 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
29 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
30 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
31 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
32 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
33 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
34 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
35 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
36 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
37 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
38 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
39 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
40 "\\xfe\\xff'")
41 testrepr = repr(u''.join(map(unichr, range(256))))
42 verify(testrepr == latin1repr)
Guido van Rossume4874ae2001-09-21 15:36:41 +000043
Guido van Rossuma831cac2000-03-10 23:23:21 +000044def test(method, input, output, *args):
45 if verbose:
Guido van Rossum15ffc712000-11-29 12:13:59 +000046 print '%s.%s%s =? %s... ' % (repr(input), method, args, repr(output)),
Guido van Rossuma831cac2000-03-10 23:23:21 +000047 try:
48 f = getattr(input, method)
49 value = apply(f, args)
50 except:
51 value = sys.exc_type
Guido van Rossum66503202000-04-28 20:39:58 +000052 exc = sys.exc_info()[:2]
Guido van Rossuma831cac2000-03-10 23:23:21 +000053 else:
54 exc = None
Guido van Rossum15ffc712000-11-29 12:13:59 +000055 if value != output or type(value) is not type(output):
Guido van Rossuma831cac2000-03-10 23:23:21 +000056 if verbose:
57 print 'no'
58 print '*',f, `input`, `output`, `value`
59 if exc:
Guido van Rossum66503202000-04-28 20:39:58 +000060 print ' value == %s: %s' % (exc)
Guido van Rossuma831cac2000-03-10 23:23:21 +000061 else:
62 if verbose:
63 print 'yes'
64
65test('capitalize', u' hello ', u' hello ')
66test('capitalize', u'hello ', u'Hello ')
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000067test('capitalize', u'aaaa', u'Aaaa')
68test('capitalize', u'AaAa', u'Aaaa')
Guido van Rossuma831cac2000-03-10 23:23:21 +000069
Marc-André Lemburg3a645e42001-01-16 11:54:12 +000070test('count', u'aaa', 3, u'a')
71test('count', u'aaa', 0, u'b')
72test('count', 'aaa', 3, u'a')
73test('count', 'aaa', 0, u'b')
74test('count', u'aaa', 3, 'a')
75test('count', u'aaa', 0, 'b')
76
Guido van Rossuma831cac2000-03-10 23:23:21 +000077test('title', u' hello ', u' Hello ')
78test('title', u'hello ', u'Hello ')
79test('title', u"fOrMaT thIs aS titLe String", u'Format This As Title String')
80test('title', u"fOrMaT,thIs-aS*titLe;String", u'Format,This-As*Title;String')
81test('title', u"getInt", u'Getint')
82
83test('find', u'abcdefghiabc', 0, u'abc')
84test('find', u'abcdefghiabc', 9, u'abc', 1)
85test('find', u'abcdefghiabc', -1, u'def', 4)
86
87test('rfind', u'abcdefghiabc', 9, u'abc')
88
89test('lower', u'HeLLo', u'hello')
90test('lower', u'hello', u'hello')
91
92test('upper', u'HeLLo', u'HELLO')
93test('upper', u'HELLO', u'HELLO')
94
95if 0:
96 transtable = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`xyzdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377'
97
98 test('maketrans', u'abc', transtable, u'xyz')
99 test('maketrans', u'abc', ValueError, u'xyzq')
100
101test('split', u'this is the split function',
102 [u'this', u'is', u'the', u'split', u'function'])
103test('split', u'a|b|c|d', [u'a', u'b', u'c', u'd'], u'|')
104test('split', u'a|b|c|d', [u'a', u'b', u'c|d'], u'|', 2)
105test('split', u'a b c d', [u'a', u'b c d'], None, 1)
106test('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
107test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 3)
108test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 4)
109test('split', u'a b c d', [u'a b c d'], None, 0)
110test('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
111test('split', u'a b c d ', [u'a', u'b', u'c', u'd'])
Guido van Rossum8b264542000-12-19 02:22:31 +0000112test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
113test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], '//')
114test('split', 'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
115test('split', u'endcase test', [u'endcase ', u''], u'test')
116test('split', u'endcase test', [u'endcase ', u''], 'test')
117test('split', 'endcase test', [u'endcase ', u''], u'test')
118
Guido van Rossuma831cac2000-03-10 23:23:21 +0000119
120# join now works with any sequence type
class Sequence:
    """Bare-bones sequence wrapper exposing only length and indexing."""

    def __init__(self, seq):
        # Keep a reference to the wrapped object; nothing is copied.
        self.seq = seq

    def __len__(self):
        """Return the length of the wrapped object."""
        wrapped = self.seq
        return len(wrapped)

    def __getitem__(self, i):
        """Return item ``i`` of the wrapped object."""
        wrapped = self.seq
        return wrapped[i]
125
126test('join', u' ', u'a b c d', [u'a', u'b', u'c', u'd'])
Guido van Rossum15ffc712000-11-29 12:13:59 +0000127test('join', u' ', u'a b c d', ['a', 'b', u'c', u'd'])
Guido van Rossuma831cac2000-03-10 23:23:21 +0000128test('join', u'', u'abcd', (u'a', u'b', u'c', u'd'))
Guido van Rossum15ffc712000-11-29 12:13:59 +0000129test('join', u' ', u'w x y z', Sequence('wxyz'))
Guido van Rossuma831cac2000-03-10 23:23:21 +0000130test('join', u' ', TypeError, 7)
Guido van Rossum15ffc712000-11-29 12:13:59 +0000131test('join', u' ', TypeError, Sequence([7, u'hello', 123L]))
132test('join', ' ', u'a b c d', [u'a', u'b', u'c', u'd'])
133test('join', ' ', u'a b c d', ['a', 'b', u'c', u'd'])
134test('join', '', u'abcd', (u'a', u'b', u'c', u'd'))
135test('join', ' ', u'w x y z', Sequence(u'wxyz'))
136test('join', ' ', TypeError, 7)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000137
138result = u''
139for i in range(10):
140 if i > 0:
141 result = result + u':'
142 result = result + u'x'*10
143test('join', u':', result, [u'x' * 10] * 10)
144test('join', u':', result, (u'x' * 10,) * 10)
145
146test('strip', u' hello ', u'hello')
147test('lstrip', u' hello ', u'hello ')
148test('rstrip', u' hello ', u' hello')
149test('strip', u'hello', u'hello')
150
151test('swapcase', u'HeLLo cOmpUteRs', u'hEllO CoMPuTErS')
152
153if 0:
154 test('translate', u'xyzabcdef', u'xyzxyz', transtable, u'def')
155
156 table = string.maketrans('a', u'A')
157 test('translate', u'abc', u'Abc', table)
158 test('translate', u'xyz', u'xyz', table)
159
160test('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1)
Barry Warsaw51ac5802000-03-20 16:36:48 +0000161test('replace', u'one!two!three!', u'onetwothree', '!', '')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000162test('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2)
163test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3)
164test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4)
165test('replace', u'one!two!three!', u'one!two!three!', u'!', u'@', 0)
166test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@')
167test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@')
168test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@', 2)
169
Guido van Rossum77f6a652002-04-03 22:41:51 +0000170test('startswith', u'hello', True, u'he')
171test('startswith', u'hello', True, u'hello')
172test('startswith', u'hello', False, u'hello world')
173test('startswith', u'hello', True, u'')
174test('startswith', u'hello', False, u'ello')
175test('startswith', u'hello', True, u'ello', 1)
176test('startswith', u'hello', True, u'o', 4)
177test('startswith', u'hello', False, u'o', 5)
178test('startswith', u'hello', True, u'', 5)
179test('startswith', u'hello', False, u'lo', 6)
180test('startswith', u'helloworld', True, u'lowo', 3)
181test('startswith', u'helloworld', True, u'lowo', 3, 7)
182test('startswith', u'helloworld', False, u'lowo', 3, 6)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000183
Guido van Rossum77f6a652002-04-03 22:41:51 +0000184test('endswith', u'hello', True, u'lo')
185test('endswith', u'hello', False, u'he')
186test('endswith', u'hello', True, u'')
187test('endswith', u'hello', False, u'hello world')
188test('endswith', u'helloworld', False, u'worl')
189test('endswith', u'helloworld', True, u'worl', 3, 9)
190test('endswith', u'helloworld', True, u'world', 3, 12)
191test('endswith', u'helloworld', True, u'lowo', 1, 7)
192test('endswith', u'helloworld', True, u'lowo', 2, 7)
193test('endswith', u'helloworld', True, u'lowo', 3, 7)
194test('endswith', u'helloworld', False, u'lowo', 4, 7)
195test('endswith', u'helloworld', False, u'lowo', 3, 8)
196test('endswith', u'ab', False, u'ab', 0, 1)
197test('endswith', u'ab', False, u'ab', 0, 0)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000198
# The column counter restarts after '\r' and '\n', so each tab pads only up
# to the next tab stop of its own line: 'ab' + 6 and 'g' + 7 spaces at the
# default tab size of 8, 'ab' + 2 and 'g' + 3 spaces at tab size 4.
test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab      def\ng       hi')
test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab      def\ng       hi', 8)
test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab  def\ng   hi', 4)
test('expandtabs', u'abc\r\nab\tdef\ng\thi', u'abc\r\nab  def\ng   hi', 4)
203
204if 0:
205 test('capwords', u'abc def ghi', u'Abc Def Ghi')
206 test('capwords', u'abc\tdef\nghi', u'Abc Def Ghi')
207 test('capwords', u'abc\t def \nghi', u'Abc Def Ghi')
208
Walter Dörwald068325e2002-04-15 13:36:47 +0000209test('zfill', u'123', u'123', 2)
210test('zfill', u'123', u'123', 3)
211test('zfill', u'123', u'0123', 4)
212test('zfill', u'+123', u'+123', 3)
213test('zfill', u'+123', u'+123', 4)
214test('zfill', u'+123', u'+0123', 5)
215test('zfill', u'-123', u'-123', 3)
216test('zfill', u'-123', u'-123', 4)
217test('zfill', u'-123', u'-0123', 5)
218test('zfill', u'', u'000', 3)
219test('zfill', u'34', u'34', 1)
220test('zfill', u'34', u'00034', 5)
Andrew M. Kuchlingeddd68d2002-03-29 16:21:44 +0000221
Guido van Rossuma831cac2000-03-10 23:23:21 +0000222# Comparisons:
223print 'Testing Unicode comparisons...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000224verify(u'abc' == 'abc')
225verify('abc' == u'abc')
226verify(u'abc' == u'abc')
227verify(u'abcd' > 'abc')
228verify('abcd' > u'abc')
229verify(u'abcd' > u'abc')
230verify(u'abc' < 'abcd')
231verify('abc' < u'abcd')
232verify(u'abc' < u'abcd')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000233print 'done.'
234
Marc-André Lemburge5034372000-08-08 08:04:29 +0000235if 0:
236 # Move these tests to a Unicode collation module test...
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000237
Marc-André Lemburge5034372000-08-08 08:04:29 +0000238 print 'Testing UTF-16 code point order comparisons...',
239 #No surrogates, no fixup required.
Marc-André Lemburg36619082001-01-17 19:11:13 +0000240 verify(u'\u0061' < u'\u20ac')
Marc-André Lemburge5034372000-08-08 08:04:29 +0000241 # Non surrogate below surrogate value, no fixup required
Marc-André Lemburg36619082001-01-17 19:11:13 +0000242 verify(u'\u0061' < u'\ud800\udc02')
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000243
Marc-André Lemburge5034372000-08-08 08:04:29 +0000244 # Non surrogate above surrogate value, fixup required
245 def test_lecmp(s, s2):
Tim Petersd2bf3b72001-01-18 02:22:22 +0000246 verify(s < s2 , "comparison failed on %s < %s" % (s, s2))
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000247
Marc-André Lemburge5034372000-08-08 08:04:29 +0000248 def test_fixup(s):
Fred Drake004d5e62000-10-23 17:22:08 +0000249 s2 = u'\ud800\udc01'
250 test_lecmp(s, s2)
251 s2 = u'\ud900\udc01'
252 test_lecmp(s, s2)
253 s2 = u'\uda00\udc01'
254 test_lecmp(s, s2)
255 s2 = u'\udb00\udc01'
256 test_lecmp(s, s2)
257 s2 = u'\ud800\udd01'
258 test_lecmp(s, s2)
259 s2 = u'\ud900\udd01'
260 test_lecmp(s, s2)
261 s2 = u'\uda00\udd01'
262 test_lecmp(s, s2)
263 s2 = u'\udb00\udd01'
264 test_lecmp(s, s2)
265 s2 = u'\ud800\ude01'
266 test_lecmp(s, s2)
267 s2 = u'\ud900\ude01'
268 test_lecmp(s, s2)
269 s2 = u'\uda00\ude01'
270 test_lecmp(s, s2)
271 s2 = u'\udb00\ude01'
272 test_lecmp(s, s2)
273 s2 = u'\ud800\udfff'
274 test_lecmp(s, s2)
275 s2 = u'\ud900\udfff'
276 test_lecmp(s, s2)
277 s2 = u'\uda00\udfff'
278 test_lecmp(s, s2)
279 s2 = u'\udb00\udfff'
280 test_lecmp(s, s2)
Marc-André Lemburge5034372000-08-08 08:04:29 +0000281
282 test_fixup(u'\ue000')
283 test_fixup(u'\uff61')
284
285 # Surrogates on both sides, no fixup required
Marc-André Lemburg36619082001-01-17 19:11:13 +0000286 verify(u'\ud800\udc02' < u'\ud84d\udc56')
Marc-André Lemburge5034372000-08-08 08:04:29 +0000287 print 'done.'
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000288
# Expected values are padded to the requested width.  center() puts the
# odd leftover space on the right for these inputs (3 + 4 at width 10,
# 1 + 2 at width 6).  A width smaller than the string leaves it unchanged.
test('ljust', u'abc', u'abc       ', 10)
test('rjust', u'abc', u'       abc', 10)
test('center', u'abc', u'   abc    ', 10)
test('ljust', u'abc', u'abc   ', 6)
test('rjust', u'abc', u'   abc', 6)
test('center', u'abc', u' abc  ', 6)
test('ljust', u'abc', u'abc', 2)
test('rjust', u'abc', u'abc', 2)
test('center', u'abc', u'abc', 2)
298
Guido van Rossum77f6a652002-04-03 22:41:51 +0000299test('islower', u'a', True)
300test('islower', u'A', False)
301test('islower', u'\n', False)
302test('islower', u'\u1FFc', False)
303test('islower', u'abc', True)
304test('islower', u'aBc', False)
305test('islower', u'abc\n', True)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000306
Guido van Rossum77f6a652002-04-03 22:41:51 +0000307test('isupper', u'a', False)
308test('isupper', u'A', True)
309test('isupper', u'\n', False)
Marc-André Lemburgef0a0322001-02-10 14:09:31 +0000310if sys.platform[:4] != 'java':
Guido van Rossum77f6a652002-04-03 22:41:51 +0000311 test('isupper', u'\u1FFc', False)
312test('isupper', u'ABC', True)
313test('isupper', u'AbC', False)
314test('isupper', u'ABC\n', True)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000315
Guido van Rossum77f6a652002-04-03 22:41:51 +0000316test('istitle', u'a', False)
317test('istitle', u'A', True)
318test('istitle', u'\n', False)
319test('istitle', u'\u1FFc', True)
320test('istitle', u'A Titlecased Line', True)
321test('istitle', u'A\nTitlecased Line', True)
322test('istitle', u'A Titlecased, Line', True)
323test('istitle', u'Greek \u1FFcitlecases ...', True)
324test('istitle', u'Not a capitalized String', False)
325test('istitle', u'Not\ta Titlecase String', False)
326test('istitle', u'Not--a Titlecase String', False)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000327
Guido van Rossum77f6a652002-04-03 22:41:51 +0000328test('isalpha', u'a', True)
329test('isalpha', u'A', True)
330test('isalpha', u'\n', False)
331test('isalpha', u'\u1FFc', True)
332test('isalpha', u'abc', True)
333test('isalpha', u'aBc123', False)
334test('isalpha', u'abc\n', False)
Marc-André Lemburg9d467412000-07-05 09:46:40 +0000335
Guido van Rossum77f6a652002-04-03 22:41:51 +0000336test('isalnum', u'a', True)
337test('isalnum', u'A', True)
338test('isalnum', u'\n', False)
339test('isalnum', u'123abc456', True)
340test('isalnum', u'a1b3c', True)
341test('isalnum', u'aBc000 ', False)
342test('isalnum', u'abc\n', False)
Marc-André Lemburg9d467412000-07-05 09:46:40 +0000343
Guido van Rossuma831cac2000-03-10 23:23:21 +0000344test('splitlines', u"abc\ndef\n\rghi", [u'abc', u'def', u'', u'ghi'])
345test('splitlines', u"abc\ndef\n\r\nghi", [u'abc', u'def', u'', u'ghi'])
346test('splitlines', u"abc\ndef\r\nghi", [u'abc', u'def', u'ghi'])
347test('splitlines', u"abc\ndef\r\nghi\n", [u'abc', u'def', u'ghi'])
348test('splitlines', u"abc\ndef\r\nghi\n\r", [u'abc', u'def', u'ghi', u''])
349test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'', u'abc', u'def', u'ghi', u''])
Guido van Rossum77f6a652002-04-03 22:41:51 +0000350test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'\n', u'abc\n', u'def\r\n', u'ghi\n', u'\r'], True)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000351
352test('translate', u"abababc", u'bbbc', {ord('a'):None})
353test('translate', u"abababc", u'iiic', {ord('a'):None, ord('b'):ord('i')})
354test('translate', u"abababc", u'iiix', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})
355
Guido van Rossumd4d26842000-03-13 23:21:48 +0000356# Contains:
357print 'Testing Unicode contains method...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000358verify(('a' in u'abdb') == 1)
359verify(('a' in u'bdab') == 1)
360verify(('a' in u'bdaba') == 1)
361verify(('a' in u'bdba') == 1)
362verify(('a' in u'bdba') == 1)
363verify((u'a' in u'bdba') == 1)
364verify((u'a' in u'bdb') == 0)
365verify((u'a' in 'bdb') == 0)
366verify((u'a' in 'bdba') == 1)
367verify((u'a' in ('a',1,None)) == 1)
368verify((u'a' in (1,None,'a')) == 1)
369verify((u'a' in (1,None,u'a')) == 1)
370verify(('a' in ('a',1,None)) == 1)
371verify(('a' in (1,None,'a')) == 1)
372verify(('a' in (1,None,u'a')) == 1)
373verify(('a' in ('x',1,u'y')) == 0)
374verify(('a' in ('x',1,None)) == 0)
Guido van Rossumd4d26842000-03-13 23:21:48 +0000375print 'done.'
376
Guido van Rossuma831cac2000-03-10 23:23:21 +0000377# Formatting:
378print 'Testing Unicode formatting strings...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000379verify(u"%s, %s" % (u"abc", "abc") == u'abc, abc')
# "%5.2f" has a minimum field width of five, so 3.00, 3.50 and 3.57 each
# gain one leading space; 1003.57 is already wider and is not padded.
verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3) == u'abc, abc, 1, 2.000000,  3.00')
verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3) == u'abc, abc, 1, -2.000000,  3.00')
verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5) == u'abc, abc, -1, -2.000000,  3.50')
verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57) == u'abc, abc, -1, -2.000000,  3.57')
verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57) == u'abc, abc, -1, -2.000000, 1003.57')
385verify(u"%c" % (u"a",) == u'a')
386verify(u"%c" % ("a",) == u'a')
387verify(u"%c" % (34,) == u'"')
388verify(u"%c" % (36,) == u'$')
Marc-André Lemburgef0a0322001-02-10 14:09:31 +0000389if sys.platform[:4] != 'java':
390 value = u"%r, %r" % (u"abc", "abc")
391 if value != u"u'abc', 'abc'":
392 print '*** formatting failed for "%s"' % 'u"%r, %r" % (u"abc", "abc")'
Marc-André Lemburg84625732000-06-13 12:05:36 +0000393
Marc-André Lemburg36619082001-01-17 19:11:13 +0000394verify(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"} == u'abc, def')
Marc-André Lemburg84625732000-06-13 12:05:36 +0000395try:
Marc-André Lemburg72f82132001-11-20 15:18:49 +0000396 value = u"%(x)s, %(ä)s" % {'x':u"abc", u'ä':"def"}
Marc-André Lemburg84625732000-06-13 12:05:36 +0000397except KeyError:
398 print '*** formatting failed for "%s"' % "u'abc, def'"
399else:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000400 verify(value == u'abc, def')
Marc-André Lemburg84625732000-06-13 12:05:36 +0000401
Guido van Rossum97064862000-04-10 13:52:48 +0000402# formatting jobs delegated from the string implementation:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000403verify('...%(foo)s...' % {'foo':u"abc"} == u'...abc...')
404verify('...%(foo)s...' % {'foo':"abc"} == '...abc...')
405verify('...%(foo)s...' % {u'foo':"abc"} == '...abc...')
406verify('...%(foo)s...' % {u'foo':u"abc"} == u'...abc...')
407verify('...%(foo)s...' % {u'foo':u"abc",'def':123} == u'...abc...')
408verify('...%(foo)s...' % {u'foo':u"abc",u'def':123} == u'...abc...')
409verify('...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...1...2...3...abc...')
410verify('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...%...%s...1...2...3...abc...')
411verify('...%s...' % u"abc" == u'...abc...')
# '*' takes the field width (and precision) from the argument tuple.  A
# width of 5 right-justifies the text, a negative width left-justifies it,
# and the precision truncates the string before padding is applied.
verify('%*s' % (5,u'abc',) == u'  abc')
verify('%*s' % (-5,u'abc',) == u'abc  ')
verify('%*.*s' % (5,2,u'abc',) == u'   ab')
verify('%*.*s' % (5,3,u'abc',) == u'  abc')
verify('%i %*.*s' % (10, 5,3,u'abc',) == u'10   abc')
verify('%i%s %*.*s' % (10, 3, 5,3,u'abc',) == u'103   abc')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000418print 'done.'
419
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000420print 'Testing builtin unicode()...',
421
422# unicode(obj) tests (this maps to PyObject_Unicode() at C level)
423
424verify(unicode(u'unicode remains unicode') == u'unicode remains unicode')
425
class UnicodeSubclass(unicode):
    """Empty unicode subclass used to check conversion back to unicode."""
428
429verify(unicode(UnicodeSubclass('unicode subclass becomes unicode'))
430 == u'unicode subclass becomes unicode')
431
432verify(unicode('strings are converted to unicode')
433 == u'strings are converted to unicode')
434
class UnicodeCompat:
    """Object that converts through the ``__unicode__`` hook only."""

    def __init__(self, x):
        # Value handed back unchanged by __unicode__().
        self.x = x

    def __unicode__(self):
        """Return the stored value."""
        stored = self.x
        return stored
440
441verify(unicode(UnicodeCompat('__unicode__ compatible objects are recognized'))
442 == u'__unicode__ compatible objects are recognized')
443
class StringCompat:
    """Object that converts through the ``__str__`` hook only."""

    def __init__(self, x):
        # Value handed back unchanged by __str__().
        self.x = x

    def __str__(self):
        """Return the stored value."""
        stored = self.x
        return stored
449
450verify(unicode(StringCompat('__str__ compatible objects are recognized'))
451 == u'__str__ compatible objects are recognized')
452
453# unicode(obj) is compatible to str():
454
455o = StringCompat('unicode(obj) is compatible to str()')
456verify(unicode(o) == u'unicode(obj) is compatible to str()')
457verify(str(o) == 'unicode(obj) is compatible to str()')
458
459for obj in (123, 123.45, 123L):
460 verify(unicode(obj) == unicode(str(obj)))
461
462# unicode(obj, encoding, error) tests (this maps to
463# PyUnicode_FromEncodedObject() at C level)
464
Finn Bock2b29cb22001-12-10 20:57:34 +0000465if not sys.platform.startswith('java'):
466 try:
467 unicode(u'decoding unicode is not supported', 'utf-8', 'strict')
468 except TypeError:
469 pass
470 else:
471 raise TestFailed, "decoding unicode should NOT be supported"
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000472
473verify(unicode('strings are decoded to unicode', 'utf-8', 'strict')
474 == u'strings are decoded to unicode')
475
Finn Bock2b29cb22001-12-10 20:57:34 +0000476if not sys.platform.startswith('java'):
477 verify(unicode(buffer('character buffers are decoded to unicode'),
478 'utf-8', 'strict')
479 == u'character buffers are decoded to unicode')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000480
481print 'done.'
482
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000483# Test builtin codecs
484print 'Testing builtin codecs...',
485
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000486# UTF-7 specific encoding tests:
487utfTests = [(u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
488 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
489 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
490 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
491 (u'+', '+-'),
492 (u'+-', '+--'),
493 (u'+?', '+-?'),
494 (u'\?', '+AFw?'),
495 (u'+?', '+-?'),
496 (ur'\\?', '+AFwAXA?'),
497 (ur'\\\?', '+AFwAXABc?'),
498 (ur'++--', '+-+---')]
499
500for x,y in utfTests:
501 verify( x.encode('utf-7') == y )
502
Tim Peters527e64f2001-10-04 05:36:56 +0000503try:
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000504 unicode('+3ADYAA-', 'utf-7') # surrogates not supported
505except UnicodeError:
506 pass
507else:
508 raise TestFailed, "unicode('+3ADYAA-', 'utf-7') failed to raise an exception"
509
510verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')
511
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000512# UTF-8 specific encoding tests:
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +0000513verify(u''.encode('utf-8') == '')
Marc-André Lemburg3688a882002-02-06 18:09:02 +0000514verify(u'\u20ac'.encode('utf-8') == '\xe2\x82\xac')
515verify(u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82')
516verify(u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96')
517verify(u'\ud800'.encode('utf-8') == '\xed\xa0\x80')
518verify(u'\udc00'.encode('utf-8') == '\xed\xb0\x80')
519verify((u'\ud800\udc02'*1000).encode('utf-8') ==
520 '\xf0\x90\x80\x82'*1000)
Marc-André Lemburgce0b6642002-04-10 17:18:02 +0000521verify(u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
522 u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
523 u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
524 u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
525 u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
Tim Peters863ac442002-04-16 01:38:40 +0000526 u' Nunstuck git und'.encode('utf-8') ==
Marc-André Lemburgce0b6642002-04-10 17:18:02 +0000527 '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
528 '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
529 '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
530 '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
531 '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
532 '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
533 '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
534 '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
535 '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
536 '\xe3\x80\x8cWenn ist das Nunstuck git und')
Marc-André Lemburg3688a882002-02-06 18:09:02 +0000537
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000538# UTF-8 specific decoding tests
Marc-André Lemburg3688a882002-02-06 18:09:02 +0000539verify(unicode('\xf0\xa3\x91\x96', 'utf-8') == u'\U00023456' )
540verify(unicode('\xf0\x90\x80\x82', 'utf-8') == u'\U00010002' )
541verify(unicode('\xe2\x82\xac', 'utf-8') == u'\u20ac' )
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000542
543# Other possible utf-8 test cases:
544# * strict decoding testing for all of the
545# UTF8_ERROR cases in PyUnicode_DecodeUTF8
546
Marc-André Lemburg36619082001-01-17 19:11:13 +0000547verify(unicode('hello','ascii') == u'hello')
548verify(unicode('hello','utf-8') == u'hello')
549verify(unicode('hello','utf8') == u'hello')
550verify(unicode('hello','latin-1') == u'hello')
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000551
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000552# Error handling
# Each call gets its own try block.  With both calls in one block, the
# first raises and the explicit 'strict' call is never executed at all.
for codec_args in (('ascii',), ('ascii', 'strict')):
    shown_args = ', '.join(map(repr, codec_args))
    try:
        u'Andr\202 x'.encode(*codec_args)
    except ValueError:
        pass
    else:
        raise TestFailed(
            "u'Andr\\202 x'.encode(%s) failed to raise an exception"
            % shown_args)
verify(u'Andr\202 x'.encode('ascii','ignore') == "Andr x")
verify(u'Andr\202 x'.encode('ascii','replace') == "Andr? x")

for codec_args in (('ascii',), ('ascii', 'strict')):
    shown_args = ', '.join(map(repr, codec_args))
    try:
        unicode('Andr\202 x', *codec_args)
    except ValueError:
        pass
    else:
        raise TestFailed(
            "unicode('Andr\\202 x', %s) failed to raise an exception"
            % shown_args)
verify(unicode('Andr\202 x','ascii','ignore') == u"Andr x")
verify(unicode('Andr\202 x','ascii','replace') == u'Andr\uFFFD x')
Guido van Rossum97064862000-04-10 13:52:48 +0000572
Martin v. Löwis047c05e2002-03-21 08:55:28 +0000573verify("\\N{foo}xx".decode("unicode-escape", "ignore") == u"xx")
574try:
575 "\\".decode("unicode-escape")
576except ValueError:
577 pass
578else:
579 raise TestFailed, '"\\".decode("unicode-escape") should fail'
580
Marc-André Lemburg36619082001-01-17 19:11:13 +0000581verify(u'hello'.encode('ascii') == 'hello')
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000582verify(u'hello'.encode('utf-7') == 'hello')
Marc-André Lemburg36619082001-01-17 19:11:13 +0000583verify(u'hello'.encode('utf-8') == 'hello')
584verify(u'hello'.encode('utf8') == 'hello')
585verify(u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000')
586verify(u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o')
587verify(u'hello'.encode('latin-1') == 'hello')
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000588
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000589# Roundtrip safety for BMP (just the first 1024 chars)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000590u = u''.join(map(unichr, range(1024)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000591for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000592 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
Marc-André Lemburg36619082001-01-17 19:11:13 +0000593 verify(unicode(u.encode(encoding),encoding) == u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000594
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +0000595# Roundtrip safety for BMP (just the first 256 chars)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000596u = u''.join(map(unichr, range(256)))
Guido van Rossum9e896b32000-04-05 20:11:21 +0000597for encoding in (
598 'latin-1',
599 ):
600 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000601 verify(unicode(u.encode(encoding),encoding) == u)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000602 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000603 print '*** codec "%s" failed round-trip' % encoding
604 except ValueError,why:
605 print '*** codec for "%s" failed: %s' % (encoding, why)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000606
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +0000607# Roundtrip safety for BMP (just the first 128 chars)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000608u = u''.join(map(unichr, range(128)))
Guido van Rossum9e896b32000-04-05 20:11:21 +0000609for encoding in (
610 'ascii',
611 ):
612 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000613 verify(unicode(u.encode(encoding),encoding) == u)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000614 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000615 print '*** codec "%s" failed round-trip' % encoding
616 except ValueError,why:
617 print '*** codec for "%s" failed: %s' % (encoding, why)
618
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +0000619# Roundtrip safety for non-BMP (just a few chars)
620u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
621for encoding in ('utf-8',
622 'utf-16', 'utf-16-le', 'utf-16-be',
623 #'raw_unicode_escape',
624 'unicode_escape', 'unicode_internal'):
625 verify(unicode(u.encode(encoding),encoding) == u)
626
627# UTF-8 must be roundtrip safe for all UCS-2 code points
628u = u''.join(map(unichr, range(0x10000)))
629for encoding in ('utf-8',):
630 verify(unicode(u.encode(encoding),encoding) == u)
631
Guido van Rossum9e896b32000-04-05 20:11:21 +0000632print 'done.'
633
634print 'Testing standard mapping codecs...',
635
636print '0-127...',
637s = ''.join(map(chr, range(128)))
638for encoding in (
639 'cp037', 'cp1026',
640 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
641 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
Fred Drake004d5e62000-10-23 17:22:08 +0000642 'cp863', 'cp865', 'cp866',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000643 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
644 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
645 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
646 'mac_cyrillic', 'mac_latin2',
647
648 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
649 'cp1256', 'cp1257', 'cp1258',
650 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
651
652 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Tim Peters2f228e72001-05-13 00:19:31 +0000653 'cp1006', 'iso8859_8',
Fred Drake004d5e62000-10-23 17:22:08 +0000654
Guido van Rossum9e896b32000-04-05 20:11:21 +0000655 ### These have undefined mappings:
656 #'cp424',
Fred Drake004d5e62000-10-23 17:22:08 +0000657
Tim Peters2f228e72001-05-13 00:19:31 +0000658 ### These fail the round-trip:
659 #'cp875'
660
Guido van Rossum9e896b32000-04-05 20:11:21 +0000661 ):
662 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000663 verify(unicode(s,encoding).encode(encoding) == s)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000664 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000665 print '*** codec "%s" failed round-trip' % encoding
666 except ValueError,why:
667 print '*** codec for "%s" failed: %s' % (encoding, why)
668
669print '128-255...',
670s = ''.join(map(chr, range(128,256)))
671for encoding in (
672 'cp037', 'cp1026',
673 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
674 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
Fred Drake004d5e62000-10-23 17:22:08 +0000675 'cp863', 'cp865', 'cp866',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000676 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
Tim Petersd2bf3b72001-01-18 02:22:22 +0000677 'iso8859_2', 'iso8859_4', 'iso8859_5',
Marc-André Lemburga866df82001-01-03 21:29:14 +0000678 'iso8859_9', 'koi8_r', 'latin_1',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000679 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +0000680
Guido van Rossum9e896b32000-04-05 20:11:21 +0000681 ### These have undefined mappings:
682 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
683 #'cp1256', 'cp1257', 'cp1258',
684 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Tim Petersd2bf3b72001-01-18 02:22:22 +0000685 #'iso8859_3', 'iso8859_6', 'iso8859_7',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000686 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +0000687
Guido van Rossum9e896b32000-04-05 20:11:21 +0000688 ### These fail the round-trip:
689 #'cp1006', 'cp875', 'iso8859_8',
Fred Drake004d5e62000-10-23 17:22:08 +0000690
Guido van Rossum9e896b32000-04-05 20:11:21 +0000691 ):
692 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000693 verify(unicode(s,encoding).encode(encoding) == s)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000694 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000695 print '*** codec "%s" failed round-trip' % encoding
696 except ValueError,why:
697 print '*** codec for "%s" failed: %s' % (encoding, why)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000698
699print 'done.'
Fred Drakee0243e22000-04-13 14:11:56 +0000700
701print 'Testing Unicode string concatenation...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000702verify((u"abc" u"def") == u"abcdef")
703verify(("abc" u"def") == u"abcdef")
704verify((u"abc" "def") == u"abcdef")
705verify((u"abc" u"def" "ghi") == u"abcdefghi")
706verify(("abc" "def" u"ghi") == u"abcdefghi")
Fred Drakee0243e22000-04-13 14:11:56 +0000707print 'done.'
Marc-André Lemburg0c4d8d02001-11-20 15:17:25 +0000708
709print 'Testing Unicode printing...',
710print u'abc'
711print u'abc', u'def'
712print u'abc', 'def'
713print 'abc', u'def'
714print u'abc\n'
715print u'abc\n',
716print u'abc\n',
717print u'def\n'
718print u'def\n'
719print 'done.'