""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
Tim Peters2f228e72001-05-13 00:19:31 +00008from test_support import verify, verbose, TestFailed
Andrew M. Kuchlingeddd68d2002-03-29 16:21:44 +00009import sys, string
Guido van Rossuma831cac2000-03-10 23:23:21 +000010
if not sys.platform.startswith('java'):
    # Test basic sanity of repr().  The whole group is skipped when
    # sys.platform starts with 'java' (presumably because the unicode repr
    # differs there -- TODO confirm).
    verify(repr(u'abc') == "u'abc'")
    verify(repr(u'ab\\c') == "u'ab\\\\c'")
    verify(repr(u'ab\\') == "u'ab\\\\'")
    verify(repr(u'\\c') == "u'\\\\c'")
    verify(repr(u'\\') == "u'\\\\'")
    verify(repr(u'\n') == "u'\\n'")
    verify(repr(u'\r') == "u'\\r'")
    verify(repr(u'\t') == "u'\\t'")
    verify(repr(u'\b') == "u'\\x08'")
    verify(repr(u"'\"") == """u'\\'"'""")
    verify(repr(u"'\"") == """u'\\'"'""")
    verify(repr(u"'") == '''u"'"''')
    verify(repr(u'"') == """u'"'""")
    # Expected repr() of the 256 code points U+0000..U+00FF joined into one
    # string; compared against the computed repr below.
    latin1repr = (
        "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
        "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
        "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
        "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
        "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
        "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
        "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
        "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
        "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
        "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
        "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
        "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
        "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
        "\\xfe\\xff'")
    testrepr = repr(u''.join(map(unichr, range(256))))
    verify(testrepr == latin1repr)

Guido van Rossuma831cac2000-03-10 23:23:21 +000044def test(method, input, output, *args):
45 if verbose:
Guido van Rossum15ffc712000-11-29 12:13:59 +000046 print '%s.%s%s =? %s... ' % (repr(input), method, args, repr(output)),
Guido van Rossuma831cac2000-03-10 23:23:21 +000047 try:
48 f = getattr(input, method)
49 value = apply(f, args)
50 except:
51 value = sys.exc_type
Guido van Rossum66503202000-04-28 20:39:58 +000052 exc = sys.exc_info()[:2]
Guido van Rossuma831cac2000-03-10 23:23:21 +000053 else:
54 exc = None
Guido van Rossum15ffc712000-11-29 12:13:59 +000055 if value != output or type(value) is not type(output):
Guido van Rossuma831cac2000-03-10 23:23:21 +000056 if verbose:
57 print 'no'
58 print '*',f, `input`, `output`, `value`
59 if exc:
Guido van Rossum66503202000-04-28 20:39:58 +000060 print ' value == %s: %s' % (exc)
Guido van Rossuma831cac2000-03-10 23:23:21 +000061 else:
62 if verbose:
63 print 'yes'
64
# Each call is test(method, input, expected, *method_args).
test('capitalize', u' hello ', u' hello ')
test('capitalize', u'hello ', u'Hello ')
test('capitalize', u'aaaa', u'Aaaa')
test('capitalize', u'AaAa', u'Aaaa')

# count: both the receiver and the argument may be str or unicode.
test('count', u'aaa', 3, u'a')
test('count', u'aaa', 0, u'b')
test('count', 'aaa', 3, u'a')
test('count', 'aaa', 0, u'b')
test('count', u'aaa', 3, 'a')
test('count', u'aaa', 0, 'b')

test('title', u' hello ', u' Hello ')
test('title', u'hello ', u'Hello ')
test('title', u"fOrMaT thIs aS titLe String", u'Format This As Title String')
test('title', u"fOrMaT,thIs-aS*titLe;String", u'Format,This-As*Title;String')
test('title', u"getInt", u'Getint')

test('find', u'abcdefghiabc', 0, u'abc')
test('find', u'abcdefghiabc', 9, u'abc', 1)
test('find', u'abcdefghiabc', -1, u'def', 4)

test('rfind', u'abcdefghiabc', 9, u'abc')

test('lower', u'HeLLo', u'hello')
test('lower', u'hello', u'hello')

test('upper', u'HeLLo', u'HELLO')
test('upper', u'HELLO', u'HELLO')

# Disabled: these exercise a 'maketrans' method on unicode objects.
if 0:
    transtable = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`xyzdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377'

    test('maketrans', u'abc', transtable, u'xyz')
    test('maketrans', u'abc', ValueError, u'xyzq')

test('split', u'this is the split function',
     [u'this', u'is', u'the', u'split', u'function'])
test('split', u'a|b|c|d', [u'a', u'b', u'c', u'd'], u'|')
test('split', u'a|b|c|d', [u'a', u'b', u'c|d'], u'|', 2)
test('split', u'a b c d', [u'a', u'b c d'], None, 1)
test('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 3)
test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 4)
test('split', u'a b c d', [u'a b c d'], None, 0)
test('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
test('split', u'a b c d ', [u'a', u'b', u'c', u'd'])
# Multi-character separators, in every str/unicode combination; the
# result is always a list of unicode strings.
test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], '//')
test('split', 'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
test('split', u'endcase test', [u'endcase ', u''], u'test')
test('split', u'endcase test', [u'endcase ', u''], 'test')
test('split', 'endcase test', [u'endcase ', u''], u'test')


# join now works with any sequence type
class Sequence:
    """Minimal wrapper exposing only len() and indexing of a sequence.

    Used by the join tests below as an argument that is neither a list
    nor a tuple.
    """

    def __init__(self, seq):
        # seq: the wrapped sequence; every operation is delegated to it.
        self.seq = seq

    def __len__(self):
        return len(self.seq)

    def __getitem__(self, i):
        return self.seq[i]

126test('join', u' ', u'a b c d', [u'a', u'b', u'c', u'd'])
Guido van Rossum15ffc712000-11-29 12:13:59 +0000127test('join', u' ', u'a b c d', ['a', 'b', u'c', u'd'])
Guido van Rossuma831cac2000-03-10 23:23:21 +0000128test('join', u'', u'abcd', (u'a', u'b', u'c', u'd'))
Guido van Rossum15ffc712000-11-29 12:13:59 +0000129test('join', u' ', u'w x y z', Sequence('wxyz'))
Guido van Rossuma831cac2000-03-10 23:23:21 +0000130test('join', u' ', TypeError, 7)
Guido van Rossum15ffc712000-11-29 12:13:59 +0000131test('join', u' ', TypeError, Sequence([7, u'hello', 123L]))
132test('join', ' ', u'a b c d', [u'a', u'b', u'c', u'd'])
133test('join', ' ', u'a b c d', ['a', 'b', u'c', u'd'])
134test('join', '', u'abcd', (u'a', u'b', u'c', u'd'))
135test('join', ' ', u'w x y z', Sequence(u'wxyz'))
136test('join', ' ', TypeError, 7)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000137
138result = u''
139for i in range(10):
140 if i > 0:
141 result = result + u':'
142 result = result + u'x'*10
143test('join', u':', result, [u'x' * 10] * 10)
144test('join', u':', result, (u'x' * 10,) * 10)
145
146test('strip', u' hello ', u'hello')
147test('lstrip', u' hello ', u'hello ')
148test('rstrip', u' hello ', u' hello')
149test('strip', u'hello', u'hello')
150
151test('swapcase', u'HeLLo cOmpUteRs', u'hEllO CoMPuTErS')
152
153if 0:
154 test('translate', u'xyzabcdef', u'xyzxyz', transtable, u'def')
155
156 table = string.maketrans('a', u'A')
157 test('translate', u'abc', u'Abc', table)
158 test('translate', u'xyz', u'xyz', table)
159
160test('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1)
Barry Warsaw51ac5802000-03-20 16:36:48 +0000161test('replace', u'one!two!three!', u'onetwothree', '!', '')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000162test('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2)
163test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3)
164test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4)
165test('replace', u'one!two!three!', u'one!two!three!', u'!', u'@', 0)
166test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@')
167test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@')
168test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@', 2)
169
Guido van Rossum77f6a652002-04-03 22:41:51 +0000170test('startswith', u'hello', True, u'he')
171test('startswith', u'hello', True, u'hello')
172test('startswith', u'hello', False, u'hello world')
173test('startswith', u'hello', True, u'')
174test('startswith', u'hello', False, u'ello')
175test('startswith', u'hello', True, u'ello', 1)
176test('startswith', u'hello', True, u'o', 4)
177test('startswith', u'hello', False, u'o', 5)
178test('startswith', u'hello', True, u'', 5)
179test('startswith', u'hello', False, u'lo', 6)
180test('startswith', u'helloworld', True, u'lowo', 3)
181test('startswith', u'helloworld', True, u'lowo', 3, 7)
182test('startswith', u'helloworld', False, u'lowo', 3, 6)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000183
Guido van Rossum77f6a652002-04-03 22:41:51 +0000184test('endswith', u'hello', True, u'lo')
185test('endswith', u'hello', False, u'he')
186test('endswith', u'hello', True, u'')
187test('endswith', u'hello', False, u'hello world')
188test('endswith', u'helloworld', False, u'worl')
189test('endswith', u'helloworld', True, u'worl', 3, 9)
190test('endswith', u'helloworld', True, u'world', 3, 12)
191test('endswith', u'helloworld', True, u'lowo', 1, 7)
192test('endswith', u'helloworld', True, u'lowo', 2, 7)
193test('endswith', u'helloworld', True, u'lowo', 3, 7)
194test('endswith', u'helloworld', False, u'lowo', 4, 7)
195test('endswith', u'helloworld', False, u'lowo', 3, 8)
196test('endswith', u'ab', False, u'ab', 0, 1)
197test('endswith', u'ab', False, u'ab', 0, 0)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000198
199test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab def\ng hi')
200test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab def\ng hi', 8)
201test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab def\ng hi', 4)
202test('expandtabs', u'abc\r\nab\tdef\ng\thi', u'abc\r\nab def\ng hi', 4)
203
204if 0:
205 test('capwords', u'abc def ghi', u'Abc Def Ghi')
206 test('capwords', u'abc\tdef\nghi', u'Abc Def Ghi')
207 test('capwords', u'abc\t def \nghi', u'Abc Def Ghi')
208
Andrew M. Kuchlingeddd68d2002-03-29 16:21:44 +0000209verify(string.zfill(u'34', 1) == u'34')
210verify(string.zfill(u'34', 5) == u'00034')
211
Guido van Rossuma831cac2000-03-10 23:23:21 +0000212# Comparisons:
213print 'Testing Unicode comparisons...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000214verify(u'abc' == 'abc')
215verify('abc' == u'abc')
216verify(u'abc' == u'abc')
217verify(u'abcd' > 'abc')
218verify('abcd' > u'abc')
219verify(u'abcd' > u'abc')
220verify(u'abc' < 'abcd')
221verify('abc' < u'abcd')
222verify(u'abc' < u'abcd')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000223print 'done.'
224
Marc-André Lemburge5034372000-08-08 08:04:29 +0000225if 0:
226 # Move these tests to a Unicode collation module test...
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000227
Marc-André Lemburge5034372000-08-08 08:04:29 +0000228 print 'Testing UTF-16 code point order comparisons...',
229 #No surrogates, no fixup required.
Marc-André Lemburg36619082001-01-17 19:11:13 +0000230 verify(u'\u0061' < u'\u20ac')
Marc-André Lemburge5034372000-08-08 08:04:29 +0000231 # Non surrogate below surrogate value, no fixup required
Marc-André Lemburg36619082001-01-17 19:11:13 +0000232 verify(u'\u0061' < u'\ud800\udc02')
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000233
Marc-André Lemburge5034372000-08-08 08:04:29 +0000234 # Non surrogate above surrogate value, fixup required
235 def test_lecmp(s, s2):
Tim Petersd2bf3b72001-01-18 02:22:22 +0000236 verify(s < s2 , "comparison failed on %s < %s" % (s, s2))
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000237
Marc-André Lemburge5034372000-08-08 08:04:29 +0000238 def test_fixup(s):
Fred Drake004d5e62000-10-23 17:22:08 +0000239 s2 = u'\ud800\udc01'
240 test_lecmp(s, s2)
241 s2 = u'\ud900\udc01'
242 test_lecmp(s, s2)
243 s2 = u'\uda00\udc01'
244 test_lecmp(s, s2)
245 s2 = u'\udb00\udc01'
246 test_lecmp(s, s2)
247 s2 = u'\ud800\udd01'
248 test_lecmp(s, s2)
249 s2 = u'\ud900\udd01'
250 test_lecmp(s, s2)
251 s2 = u'\uda00\udd01'
252 test_lecmp(s, s2)
253 s2 = u'\udb00\udd01'
254 test_lecmp(s, s2)
255 s2 = u'\ud800\ude01'
256 test_lecmp(s, s2)
257 s2 = u'\ud900\ude01'
258 test_lecmp(s, s2)
259 s2 = u'\uda00\ude01'
260 test_lecmp(s, s2)
261 s2 = u'\udb00\ude01'
262 test_lecmp(s, s2)
263 s2 = u'\ud800\udfff'
264 test_lecmp(s, s2)
265 s2 = u'\ud900\udfff'
266 test_lecmp(s, s2)
267 s2 = u'\uda00\udfff'
268 test_lecmp(s, s2)
269 s2 = u'\udb00\udfff'
270 test_lecmp(s, s2)
Marc-André Lemburge5034372000-08-08 08:04:29 +0000271
272 test_fixup(u'\ue000')
273 test_fixup(u'\uff61')
274
275 # Surrogates on both sides, no fixup required
Marc-André Lemburg36619082001-01-17 19:11:13 +0000276 verify(u'\ud800\udc02' < u'\ud84d\udc56')
Marc-André Lemburge5034372000-08-08 08:04:29 +0000277 print 'done.'
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000278
# Padding to width 10 adds 7 spaces to 'abc' (center: 3 left, 4 right);
# width 6 adds 3 (center: 1 left, 2 right); a width below len() is a no-op.
test('ljust', u'abc',  u'abc       ', 10)
test('rjust', u'abc',  u'       abc', 10)
test('center', u'abc', u'   abc    ', 10)
test('ljust', u'abc',  u'abc   ', 6)
test('rjust', u'abc',  u'   abc', 6)
test('center', u'abc', u' abc  ', 6)
test('ljust', u'abc', u'abc', 2)
test('rjust', u'abc', u'abc', 2)
test('center', u'abc', u'abc', 2)

# The case predicates below are also probed with u'\u1FFc', which the
# tests expect to be title case only (neither lower nor upper).
test('islower', u'a', True)
test('islower', u'A', False)
test('islower', u'\n', False)
test('islower', u'\u1FFc', False)
test('islower', u'abc', True)
test('islower', u'aBc', False)
test('islower', u'abc\n', True)

test('isupper', u'a', False)
test('isupper', u'A', True)
test('isupper', u'\n', False)
if not sys.platform.startswith('java'):
    test('isupper', u'\u1FFc', False)
test('isupper', u'ABC', True)
test('isupper', u'AbC', False)
test('isupper', u'ABC\n', True)

test('istitle', u'a', False)
test('istitle', u'A', True)
test('istitle', u'\n', False)
test('istitle', u'\u1FFc', True)
test('istitle', u'A Titlecased Line', True)
test('istitle', u'A\nTitlecased Line', True)
test('istitle', u'A Titlecased, Line', True)
test('istitle', u'Greek \u1FFcitlecases ...', True)
test('istitle', u'Not a capitalized String', False)
test('istitle', u'Not\ta Titlecase String', False)
test('istitle', u'Not--a Titlecase String', False)

test('isalpha', u'a', True)
test('isalpha', u'A', True)
test('isalpha', u'\n', False)
test('isalpha', u'\u1FFc', True)
test('isalpha', u'abc', True)
test('isalpha', u'aBc123', False)
test('isalpha', u'abc\n', False)

test('isalnum', u'a', True)
test('isalnum', u'A', True)
test('isalnum', u'\n', False)
test('isalnum', u'123abc456', True)
test('isalnum', u'a1b3c', True)
test('isalnum', u'aBc000 ', False)
test('isalnum', u'abc\n', False)

# splitlines: '\n', '\r' and '\r\n' each end a line; the final True
# argument keeps the line endings in the result.
test('splitlines', u"abc\ndef\n\rghi", [u'abc', u'def', u'', u'ghi'])
test('splitlines', u"abc\ndef\n\r\nghi", [u'abc', u'def', u'', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi", [u'abc', u'def', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi\n", [u'abc', u'def', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi\n\r", [u'abc', u'def', u'ghi', u''])
test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'', u'abc', u'def', u'ghi', u''])
test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'\n', u'abc\n', u'def\r\n', u'ghi\n', u'\r'], True)

# translate with a dict keyed by ordinal: None deletes the character, an
# ordinal or a unicode string replaces it.
test('translate', u"abababc", u'bbbc', {ord('a'):None})
test('translate', u"abababc", u'iiic', {ord('a'):None, ord('b'):ord('i')})
test('translate', u"abababc", u'iiix', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})

Guido van Rossumd4d26842000-03-13 23:21:48 +0000346# Contains:
347print 'Testing Unicode contains method...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000348verify(('a' in u'abdb') == 1)
349verify(('a' in u'bdab') == 1)
350verify(('a' in u'bdaba') == 1)
351verify(('a' in u'bdba') == 1)
352verify(('a' in u'bdba') == 1)
353verify((u'a' in u'bdba') == 1)
354verify((u'a' in u'bdb') == 0)
355verify((u'a' in 'bdb') == 0)
356verify((u'a' in 'bdba') == 1)
357verify((u'a' in ('a',1,None)) == 1)
358verify((u'a' in (1,None,'a')) == 1)
359verify((u'a' in (1,None,u'a')) == 1)
360verify(('a' in ('a',1,None)) == 1)
361verify(('a' in (1,None,'a')) == 1)
362verify(('a' in (1,None,u'a')) == 1)
363verify(('a' in ('x',1,u'y')) == 0)
364verify(('a' in ('x',1,None)) == 0)
Guido van Rossumd4d26842000-03-13 23:21:48 +0000365print 'done.'
366
Guido van Rossuma831cac2000-03-10 23:23:21 +0000367# Formatting:
368print 'Testing Unicode formatting strings...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000369verify(u"%s, %s" % (u"abc", "abc") == u'abc, abc')
370verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3) == u'abc, abc, 1, 2.000000, 3.00')
371verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3) == u'abc, abc, 1, -2.000000, 3.00')
372verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5) == u'abc, abc, -1, -2.000000, 3.50')
373verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57) == u'abc, abc, -1, -2.000000, 3.57')
374verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57) == u'abc, abc, -1, -2.000000, 1003.57')
375verify(u"%c" % (u"a",) == u'a')
376verify(u"%c" % ("a",) == u'a')
377verify(u"%c" % (34,) == u'"')
378verify(u"%c" % (36,) == u'$')
Marc-André Lemburgef0a0322001-02-10 14:09:31 +0000379if sys.platform[:4] != 'java':
380 value = u"%r, %r" % (u"abc", "abc")
381 if value != u"u'abc', 'abc'":
382 print '*** formatting failed for "%s"' % 'u"%r, %r" % (u"abc", "abc")'
Marc-André Lemburg84625732000-06-13 12:05:36 +0000383
Marc-André Lemburg36619082001-01-17 19:11:13 +0000384verify(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"} == u'abc, def')
Marc-André Lemburg84625732000-06-13 12:05:36 +0000385try:
Marc-André Lemburg72f82132001-11-20 15:18:49 +0000386 value = u"%(x)s, %(ä)s" % {'x':u"abc", u'ä':"def"}
Marc-André Lemburg84625732000-06-13 12:05:36 +0000387except KeyError:
388 print '*** formatting failed for "%s"' % "u'abc, def'"
389else:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000390 verify(value == u'abc, def')
Marc-André Lemburg84625732000-06-13 12:05:36 +0000391
Guido van Rossum97064862000-04-10 13:52:48 +0000392# formatting jobs delegated from the string implementation:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000393verify('...%(foo)s...' % {'foo':u"abc"} == u'...abc...')
394verify('...%(foo)s...' % {'foo':"abc"} == '...abc...')
395verify('...%(foo)s...' % {u'foo':"abc"} == '...abc...')
396verify('...%(foo)s...' % {u'foo':u"abc"} == u'...abc...')
397verify('...%(foo)s...' % {u'foo':u"abc",'def':123} == u'...abc...')
398verify('...%(foo)s...' % {u'foo':u"abc",u'def':123} == u'...abc...')
399verify('...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...1...2...3...abc...')
400verify('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...%...%s...1...2...3...abc...')
401verify('...%s...' % u"abc" == u'...abc...')
Marc-André Lemburg542fe562001-05-02 14:21:53 +0000402verify('%*s' % (5,u'abc',) == u' abc')
403verify('%*s' % (-5,u'abc',) == u'abc ')
404verify('%*.*s' % (5,2,u'abc',) == u' ab')
405verify('%*.*s' % (5,3,u'abc',) == u' abc')
406verify('%i %*.*s' % (10, 5,3,u'abc',) == u'10 abc')
407verify('%i%s %*.*s' % (10, 3, 5,3,u'abc',) == u'103 abc')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000408print 'done.'
409
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000410print 'Testing builtin unicode()...',
411
412# unicode(obj) tests (this maps to PyObject_Unicode() at C level)
413
414verify(unicode(u'unicode remains unicode') == u'unicode remains unicode')
415
416class UnicodeSubclass(unicode):
417 pass
418
419verify(unicode(UnicodeSubclass('unicode subclass becomes unicode'))
420 == u'unicode subclass becomes unicode')
421
422verify(unicode('strings are converted to unicode')
423 == u'strings are converted to unicode')
424
class UnicodeCompat:
    """Object whose __unicode__() returns the value given at construction."""

    def __init__(self, x):
        # x: value handed back unchanged by __unicode__().
        self.x = x

    def __unicode__(self):
        return self.x

# unicode() must use the object's __unicode__() method.
verify(unicode(UnicodeCompat('__unicode__ compatible objects are recognized'))
       == u'__unicode__ compatible objects are recognized')

class StringCompat:
    """Object whose __str__() returns the value given at construction."""

    def __init__(self, x):
        # x: value handed back unchanged by __str__().
        self.x = x

    def __str__(self):
        return self.x

440verify(unicode(StringCompat('__str__ compatible objects are recognized'))
441 == u'__str__ compatible objects are recognized')
442
443# unicode(obj) is compatible to str():
444
445o = StringCompat('unicode(obj) is compatible to str()')
446verify(unicode(o) == u'unicode(obj) is compatible to str()')
447verify(str(o) == 'unicode(obj) is compatible to str()')
448
449for obj in (123, 123.45, 123L):
450 verify(unicode(obj) == unicode(str(obj)))
451
452# unicode(obj, encoding, error) tests (this maps to
453# PyUnicode_FromEncodedObject() at C level)
454
Finn Bock2b29cb22001-12-10 20:57:34 +0000455if not sys.platform.startswith('java'):
456 try:
457 unicode(u'decoding unicode is not supported', 'utf-8', 'strict')
458 except TypeError:
459 pass
460 else:
461 raise TestFailed, "decoding unicode should NOT be supported"
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000462
463verify(unicode('strings are decoded to unicode', 'utf-8', 'strict')
464 == u'strings are decoded to unicode')
465
Finn Bock2b29cb22001-12-10 20:57:34 +0000466if not sys.platform.startswith('java'):
467 verify(unicode(buffer('character buffers are decoded to unicode'),
468 'utf-8', 'strict')
469 == u'character buffers are decoded to unicode')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000470
471print 'done.'
472
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000473# Test builtin codecs
474print 'Testing builtin codecs...',
475
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000476# UTF-7 specific encoding tests:
477utfTests = [(u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
478 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
479 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
480 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
481 (u'+', '+-'),
482 (u'+-', '+--'),
483 (u'+?', '+-?'),
484 (u'\?', '+AFw?'),
485 (u'+?', '+-?'),
486 (ur'\\?', '+AFwAXA?'),
487 (ur'\\\?', '+AFwAXABc?'),
488 (ur'++--', '+-+---')]
489
490for x,y in utfTests:
491 verify( x.encode('utf-7') == y )
492
Tim Peters527e64f2001-10-04 05:36:56 +0000493try:
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000494 unicode('+3ADYAA-', 'utf-7') # surrogates not supported
495except UnicodeError:
496 pass
497else:
498 raise TestFailed, "unicode('+3ADYAA-', 'utf-7') failed to raise an exception"
499
500verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')
501
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000502# UTF-8 specific encoding tests:
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +0000503verify(u''.encode('utf-8') == '')
Marc-André Lemburg3688a882002-02-06 18:09:02 +0000504verify(u'\u20ac'.encode('utf-8') == '\xe2\x82\xac')
505verify(u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82')
506verify(u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96')
507verify(u'\ud800'.encode('utf-8') == '\xed\xa0\x80')
508verify(u'\udc00'.encode('utf-8') == '\xed\xb0\x80')
509verify((u'\ud800\udc02'*1000).encode('utf-8') ==
510 '\xf0\x90\x80\x82'*1000)
Marc-André Lemburgce0b6642002-04-10 17:18:02 +0000511verify(u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
512 u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
513 u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
514 u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
515 u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
516 u' Nunstuck git und'.encode('utf-8') ==
517 '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
518 '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
519 '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
520 '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
521 '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
522 '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
523 '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
524 '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
525 '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
526 '\xe3\x80\x8cWenn ist das Nunstuck git und')
Marc-André Lemburg3688a882002-02-06 18:09:02 +0000527
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000528# UTF-8 specific decoding tests
Marc-André Lemburg3688a882002-02-06 18:09:02 +0000529verify(unicode('\xf0\xa3\x91\x96', 'utf-8') == u'\U00023456' )
530verify(unicode('\xf0\x90\x80\x82', 'utf-8') == u'\U00010002' )
531verify(unicode('\xe2\x82\xac', 'utf-8') == u'\u20ac' )
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000532
533# Other possible utf-8 test cases:
534# * strict decoding testing for all of the
535# UTF8_ERROR cases in PyUnicode_DecodeUTF8
536
Marc-André Lemburg36619082001-01-17 19:11:13 +0000537verify(unicode('hello','ascii') == u'hello')
538verify(unicode('hello','utf-8') == u'hello')
539verify(unicode('hello','utf8') == u'hello')
540verify(unicode('hello','latin-1') == u'hello')
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000541
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000542# Error handling
Guido van Rossum97064862000-04-10 13:52:48 +0000543try:
544 u'Andr\202 x'.encode('ascii')
545 u'Andr\202 x'.encode('ascii','strict')
546except ValueError:
547 pass
548else:
Guido van Rossuma1374e42001-01-19 19:01:56 +0000549 raise TestFailed, "u'Andr\202'.encode('ascii') failed to raise an exception"
Marc-André Lemburg36619082001-01-17 19:11:13 +0000550verify(u'Andr\202 x'.encode('ascii','ignore') == "Andr x")
551verify(u'Andr\202 x'.encode('ascii','replace') == "Andr? x")
Guido van Rossum97064862000-04-10 13:52:48 +0000552
553try:
554 unicode('Andr\202 x','ascii')
555 unicode('Andr\202 x','ascii','strict')
556except ValueError:
557 pass
558else:
Guido van Rossuma1374e42001-01-19 19:01:56 +0000559 raise TestFailed, "unicode('Andr\202') failed to raise an exception"
Marc-André Lemburg36619082001-01-17 19:11:13 +0000560verify(unicode('Andr\202 x','ascii','ignore') == u"Andr x")
561verify(unicode('Andr\202 x','ascii','replace') == u'Andr\uFFFD x')
Guido van Rossum97064862000-04-10 13:52:48 +0000562
Martin v. Löwis047c05e2002-03-21 08:55:28 +0000563verify("\\N{foo}xx".decode("unicode-escape", "ignore") == u"xx")
564try:
565 "\\".decode("unicode-escape")
566except ValueError:
567 pass
568else:
569 raise TestFailed, '"\\".decode("unicode-escape") should fail'
570
Marc-André Lemburg36619082001-01-17 19:11:13 +0000571verify(u'hello'.encode('ascii') == 'hello')
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000572verify(u'hello'.encode('utf-7') == 'hello')
Marc-André Lemburg36619082001-01-17 19:11:13 +0000573verify(u'hello'.encode('utf-8') == 'hello')
574verify(u'hello'.encode('utf8') == 'hello')
575verify(u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000')
576verify(u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o')
577verify(u'hello'.encode('latin-1') == 'hello')
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000578
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000579# Roundtrip safety for BMP (just the first 1024 chars)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000580u = u''.join(map(unichr, range(1024)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000581for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000582 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
Marc-André Lemburg36619082001-01-17 19:11:13 +0000583 verify(unicode(u.encode(encoding),encoding) == u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000584
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +0000585# Roundtrip safety for BMP (just the first 256 chars)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000586u = u''.join(map(unichr, range(256)))
Guido van Rossum9e896b32000-04-05 20:11:21 +0000587for encoding in (
588 'latin-1',
589 ):
590 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000591 verify(unicode(u.encode(encoding),encoding) == u)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000592 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000593 print '*** codec "%s" failed round-trip' % encoding
594 except ValueError,why:
595 print '*** codec for "%s" failed: %s' % (encoding, why)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000596
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +0000597# Roundtrip safety for BMP (just the first 128 chars)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000598u = u''.join(map(unichr, range(128)))
Guido van Rossum9e896b32000-04-05 20:11:21 +0000599for encoding in (
600 'ascii',
601 ):
602 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000603 verify(unicode(u.encode(encoding),encoding) == u)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000604 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000605 print '*** codec "%s" failed round-trip' % encoding
606 except ValueError,why:
607 print '*** codec for "%s" failed: %s' % (encoding, why)
608
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +0000609# Roundtrip safety for non-BMP (just a few chars)
610u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
611for encoding in ('utf-8',
612 'utf-16', 'utf-16-le', 'utf-16-be',
613 #'raw_unicode_escape',
614 'unicode_escape', 'unicode_internal'):
615 verify(unicode(u.encode(encoding),encoding) == u)
616
617# UTF-8 must be roundtrip safe for all UCS-2 code points
618u = u''.join(map(unichr, range(0x10000)))
619for encoding in ('utf-8',):
620 verify(unicode(u.encode(encoding),encoding) == u)
621
Guido van Rossum9e896b32000-04-05 20:11:21 +0000622print 'done.'
623
624print 'Testing standard mapping codecs...',
625
626print '0-127...',
627s = ''.join(map(chr, range(128)))
628for encoding in (
629 'cp037', 'cp1026',
630 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
631 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
Fred Drake004d5e62000-10-23 17:22:08 +0000632 'cp863', 'cp865', 'cp866',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000633 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
634 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
635 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
636 'mac_cyrillic', 'mac_latin2',
637
638 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
639 'cp1256', 'cp1257', 'cp1258',
640 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
641
642 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Tim Peters2f228e72001-05-13 00:19:31 +0000643 'cp1006', 'iso8859_8',
Fred Drake004d5e62000-10-23 17:22:08 +0000644
Guido van Rossum9e896b32000-04-05 20:11:21 +0000645 ### These have undefined mappings:
646 #'cp424',
Fred Drake004d5e62000-10-23 17:22:08 +0000647
Tim Peters2f228e72001-05-13 00:19:31 +0000648 ### These fail the round-trip:
649 #'cp875'
650
Guido van Rossum9e896b32000-04-05 20:11:21 +0000651 ):
652 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000653 verify(unicode(s,encoding).encode(encoding) == s)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000654 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000655 print '*** codec "%s" failed round-trip' % encoding
656 except ValueError,why:
657 print '*** codec for "%s" failed: %s' % (encoding, why)
658
659print '128-255...',
660s = ''.join(map(chr, range(128,256)))
661for encoding in (
662 'cp037', 'cp1026',
663 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
664 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
Fred Drake004d5e62000-10-23 17:22:08 +0000665 'cp863', 'cp865', 'cp866',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000666 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
Tim Petersd2bf3b72001-01-18 02:22:22 +0000667 'iso8859_2', 'iso8859_4', 'iso8859_5',
Marc-André Lemburga866df82001-01-03 21:29:14 +0000668 'iso8859_9', 'koi8_r', 'latin_1',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000669 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +0000670
Guido van Rossum9e896b32000-04-05 20:11:21 +0000671 ### These have undefined mappings:
672 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
673 #'cp1256', 'cp1257', 'cp1258',
674 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Tim Petersd2bf3b72001-01-18 02:22:22 +0000675 #'iso8859_3', 'iso8859_6', 'iso8859_7',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000676 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +0000677
Guido van Rossum9e896b32000-04-05 20:11:21 +0000678 ### These fail the round-trip:
679 #'cp1006', 'cp875', 'iso8859_8',
Fred Drake004d5e62000-10-23 17:22:08 +0000680
Guido van Rossum9e896b32000-04-05 20:11:21 +0000681 ):
682 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000683 verify(unicode(s,encoding).encode(encoding) == s)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000684 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000685 print '*** codec "%s" failed round-trip' % encoding
686 except ValueError,why:
687 print '*** codec for "%s" failed: %s' % (encoding, why)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000688
689print 'done.'
Fred Drakee0243e22000-04-13 14:11:56 +0000690
691print 'Testing Unicode string concatenation...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000692verify((u"abc" u"def") == u"abcdef")
693verify(("abc" u"def") == u"abcdef")
694verify((u"abc" "def") == u"abcdef")
695verify((u"abc" u"def" "ghi") == u"abcdefghi")
696verify(("abc" "def" u"ghi") == u"abcdefghi")
Fred Drakee0243e22000-04-13 14:11:56 +0000697print 'done.'
Marc-André Lemburg0c4d8d02001-11-20 15:17:25 +0000698
699print 'Testing Unicode printing...',
700print u'abc'
701print u'abc', u'def'
702print u'abc', 'def'
703print 'abc', u'def'
704print u'abc\n'
705print u'abc\n',
706print u'abc\n',
707print u'def\n'
708print u'def\n'
709print 'done.'