blob: 56f18112623c13aeb394e03b5b994b8426a43b55 [file] [log] [blame]
Guido van Rossuma831cac2000-03-10 23:23:21 +00001""" Test script for the Unicode implementation.
2
Guido van Rossuma831cac2000-03-10 23:23:21 +00003Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
Marc-André Lemburg36619082001-01-17 19:11:13 +00007"""#"
Tim Peters2f228e72001-05-13 00:19:31 +00008from test_support import verify, verbose, TestFailed
Guido van Rossuma831cac2000-03-10 23:23:21 +00009import sys
10
Finn Bock2b29cb22001-12-10 20:57:34 +000011if not sys.platform.startswith('java'):
12 # Test basic sanity of repr()
13 verify(repr(u'abc') == "u'abc'")
14 verify(repr(u'ab\\c') == "u'ab\\\\c'")
15 verify(repr(u'ab\\') == "u'ab\\\\'")
16 verify(repr(u'\\c') == "u'\\\\c'")
17 verify(repr(u'\\') == "u'\\\\'")
18 verify(repr(u'\n') == "u'\\n'")
19 verify(repr(u'\r') == "u'\\r'")
20 verify(repr(u'\t') == "u'\\t'")
21 verify(repr(u'\b') == "u'\\x08'")
22 verify(repr(u"'\"") == """u'\\'"'""")
23 verify(repr(u"'\"") == """u'\\'"'""")
24 verify(repr(u"'") == '''u"'"''')
25 verify(repr(u'"') == """u'"'""")
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +000026 latin1repr = (
27 "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
28 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
29 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
30 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
31 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
32 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
33 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
34 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
35 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
36 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
37 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
38 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
39 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
40 "\\xfe\\xff'")
41 testrepr = repr(u''.join(map(unichr, range(256))))
42 verify(testrepr == latin1repr)
Guido van Rossume4874ae2001-09-21 15:36:41 +000043
Guido van Rossuma831cac2000-03-10 23:23:21 +000044def test(method, input, output, *args):
45 if verbose:
Guido van Rossum15ffc712000-11-29 12:13:59 +000046 print '%s.%s%s =? %s... ' % (repr(input), method, args, repr(output)),
Guido van Rossuma831cac2000-03-10 23:23:21 +000047 try:
48 f = getattr(input, method)
49 value = apply(f, args)
50 except:
51 value = sys.exc_type
Guido van Rossum66503202000-04-28 20:39:58 +000052 exc = sys.exc_info()[:2]
Guido van Rossuma831cac2000-03-10 23:23:21 +000053 else:
54 exc = None
Guido van Rossum15ffc712000-11-29 12:13:59 +000055 if value != output or type(value) is not type(output):
Guido van Rossuma831cac2000-03-10 23:23:21 +000056 if verbose:
57 print 'no'
58 print '*',f, `input`, `output`, `value`
59 if exc:
Guido van Rossum66503202000-04-28 20:39:58 +000060 print ' value == %s: %s' % (exc)
Guido van Rossuma831cac2000-03-10 23:23:21 +000061 else:
62 if verbose:
63 print 'yes'
64
65test('capitalize', u' hello ', u' hello ')
66test('capitalize', u'hello ', u'Hello ')
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000067test('capitalize', u'aaaa', u'Aaaa')
68test('capitalize', u'AaAa', u'Aaaa')
Guido van Rossuma831cac2000-03-10 23:23:21 +000069
Marc-André Lemburg3a645e42001-01-16 11:54:12 +000070test('count', u'aaa', 3, u'a')
71test('count', u'aaa', 0, u'b')
72test('count', 'aaa', 3, u'a')
73test('count', 'aaa', 0, u'b')
74test('count', u'aaa', 3, 'a')
75test('count', u'aaa', 0, 'b')
76
Guido van Rossuma831cac2000-03-10 23:23:21 +000077test('title', u' hello ', u' Hello ')
78test('title', u'hello ', u'Hello ')
79test('title', u"fOrMaT thIs aS titLe String", u'Format This As Title String')
80test('title', u"fOrMaT,thIs-aS*titLe;String", u'Format,This-As*Title;String')
81test('title', u"getInt", u'Getint')
82
83test('find', u'abcdefghiabc', 0, u'abc')
84test('find', u'abcdefghiabc', 9, u'abc', 1)
85test('find', u'abcdefghiabc', -1, u'def', 4)
86
87test('rfind', u'abcdefghiabc', 9, u'abc')
88
89test('lower', u'HeLLo', u'hello')
90test('lower', u'hello', u'hello')
91
92test('upper', u'HeLLo', u'HELLO')
93test('upper', u'HELLO', u'HELLO')
94
95if 0:
96 transtable = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`xyzdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377'
97
98 test('maketrans', u'abc', transtable, u'xyz')
99 test('maketrans', u'abc', ValueError, u'xyzq')
100
101test('split', u'this is the split function',
102 [u'this', u'is', u'the', u'split', u'function'])
103test('split', u'a|b|c|d', [u'a', u'b', u'c', u'd'], u'|')
104test('split', u'a|b|c|d', [u'a', u'b', u'c|d'], u'|', 2)
105test('split', u'a b c d', [u'a', u'b c d'], None, 1)
106test('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
107test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 3)
108test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 4)
109test('split', u'a b c d', [u'a b c d'], None, 0)
110test('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
111test('split', u'a b c d ', [u'a', u'b', u'c', u'd'])
Guido van Rossum8b264542000-12-19 02:22:31 +0000112test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
113test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], '//')
114test('split', 'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
115test('split', u'endcase test', [u'endcase ', u''], u'test')
116test('split', u'endcase test', [u'endcase ', u''], 'test')
117test('split', 'endcase test', [u'endcase ', u''], u'test')
118
Guido van Rossuma831cac2000-03-10 23:23:21 +0000119
120# join now works with any sequence type
class Sequence:
    """Wrapper that exposes only ``__len__`` and ``__getitem__`` of ``seq``."""

    def __init__(self, seq):
        self.seq = seq

    def __len__(self):
        wrapped = self.seq
        return len(wrapped)

    def __getitem__(self, i):
        wrapped = self.seq
        return wrapped[i]
125
126test('join', u' ', u'a b c d', [u'a', u'b', u'c', u'd'])
Guido van Rossum15ffc712000-11-29 12:13:59 +0000127test('join', u' ', u'a b c d', ['a', 'b', u'c', u'd'])
Guido van Rossuma831cac2000-03-10 23:23:21 +0000128test('join', u'', u'abcd', (u'a', u'b', u'c', u'd'))
Guido van Rossum15ffc712000-11-29 12:13:59 +0000129test('join', u' ', u'w x y z', Sequence('wxyz'))
Guido van Rossuma831cac2000-03-10 23:23:21 +0000130test('join', u' ', TypeError, 7)
Guido van Rossum15ffc712000-11-29 12:13:59 +0000131test('join', u' ', TypeError, Sequence([7, u'hello', 123L]))
132test('join', ' ', u'a b c d', [u'a', u'b', u'c', u'd'])
133test('join', ' ', u'a b c d', ['a', 'b', u'c', u'd'])
134test('join', '', u'abcd', (u'a', u'b', u'c', u'd'))
135test('join', ' ', u'w x y z', Sequence(u'wxyz'))
136test('join', ' ', TypeError, 7)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000137
138result = u''
139for i in range(10):
140 if i > 0:
141 result = result + u':'
142 result = result + u'x'*10
143test('join', u':', result, [u'x' * 10] * 10)
144test('join', u':', result, (u'x' * 10,) * 10)
145
146test('strip', u' hello ', u'hello')
147test('lstrip', u' hello ', u'hello ')
148test('rstrip', u' hello ', u' hello')
149test('strip', u'hello', u'hello')
150
151test('swapcase', u'HeLLo cOmpUteRs', u'hEllO CoMPuTErS')
152
153if 0:
154 test('translate', u'xyzabcdef', u'xyzxyz', transtable, u'def')
155
156 table = string.maketrans('a', u'A')
157 test('translate', u'abc', u'Abc', table)
158 test('translate', u'xyz', u'xyz', table)
159
160test('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1)
Barry Warsaw51ac5802000-03-20 16:36:48 +0000161test('replace', u'one!two!three!', u'onetwothree', '!', '')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000162test('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2)
163test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3)
164test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4)
165test('replace', u'one!two!three!', u'one!two!three!', u'!', u'@', 0)
166test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@')
167test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@')
168test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@', 2)
169
170test('startswith', u'hello', 1, u'he')
171test('startswith', u'hello', 1, u'hello')
172test('startswith', u'hello', 0, u'hello world')
173test('startswith', u'hello', 1, u'')
174test('startswith', u'hello', 0, u'ello')
175test('startswith', u'hello', 1, u'ello', 1)
176test('startswith', u'hello', 1, u'o', 4)
177test('startswith', u'hello', 0, u'o', 5)
178test('startswith', u'hello', 1, u'', 5)
179test('startswith', u'hello', 0, u'lo', 6)
180test('startswith', u'helloworld', 1, u'lowo', 3)
181test('startswith', u'helloworld', 1, u'lowo', 3, 7)
182test('startswith', u'helloworld', 0, u'lowo', 3, 6)
183
184test('endswith', u'hello', 1, u'lo')
185test('endswith', u'hello', 0, u'he')
186test('endswith', u'hello', 1, u'')
187test('endswith', u'hello', 0, u'hello world')
188test('endswith', u'helloworld', 0, u'worl')
189test('endswith', u'helloworld', 1, u'worl', 3, 9)
190test('endswith', u'helloworld', 1, u'world', 3, 12)
191test('endswith', u'helloworld', 1, u'lowo', 1, 7)
192test('endswith', u'helloworld', 1, u'lowo', 2, 7)
193test('endswith', u'helloworld', 1, u'lowo', 3, 7)
194test('endswith', u'helloworld', 0, u'lowo', 4, 7)
195test('endswith', u'helloworld', 0, u'lowo', 3, 8)
196test('endswith', u'ab', 0, u'ab', 0, 1)
197test('endswith', u'ab', 0, u'ab', 0, 0)
198
199test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab def\ng hi')
200test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab def\ng hi', 8)
201test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab def\ng hi', 4)
202test('expandtabs', u'abc\r\nab\tdef\ng\thi', u'abc\r\nab def\ng hi', 4)
203
204if 0:
205 test('capwords', u'abc def ghi', u'Abc Def Ghi')
206 test('capwords', u'abc\tdef\nghi', u'Abc Def Ghi')
207 test('capwords', u'abc\t def \nghi', u'Abc Def Ghi')
208
209# Comparisons:
210print 'Testing Unicode comparisons...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000211verify(u'abc' == 'abc')
212verify('abc' == u'abc')
213verify(u'abc' == u'abc')
214verify(u'abcd' > 'abc')
215verify('abcd' > u'abc')
216verify(u'abcd' > u'abc')
217verify(u'abc' < 'abcd')
218verify('abc' < u'abcd')
219verify(u'abc' < u'abcd')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000220print 'done.'
221
Marc-André Lemburge5034372000-08-08 08:04:29 +0000222if 0:
223 # Move these tests to a Unicode collation module test...
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000224
Marc-André Lemburge5034372000-08-08 08:04:29 +0000225 print 'Testing UTF-16 code point order comparisons...',
226 #No surrogates, no fixup required.
Marc-André Lemburg36619082001-01-17 19:11:13 +0000227 verify(u'\u0061' < u'\u20ac')
Marc-André Lemburge5034372000-08-08 08:04:29 +0000228 # Non surrogate below surrogate value, no fixup required
Marc-André Lemburg36619082001-01-17 19:11:13 +0000229 verify(u'\u0061' < u'\ud800\udc02')
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000230
Marc-André Lemburge5034372000-08-08 08:04:29 +0000231 # Non surrogate above surrogate value, fixup required
232 def test_lecmp(s, s2):
Tim Petersd2bf3b72001-01-18 02:22:22 +0000233 verify(s < s2 , "comparison failed on %s < %s" % (s, s2))
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000234
Marc-André Lemburge5034372000-08-08 08:04:29 +0000235 def test_fixup(s):
Fred Drake004d5e62000-10-23 17:22:08 +0000236 s2 = u'\ud800\udc01'
237 test_lecmp(s, s2)
238 s2 = u'\ud900\udc01'
239 test_lecmp(s, s2)
240 s2 = u'\uda00\udc01'
241 test_lecmp(s, s2)
242 s2 = u'\udb00\udc01'
243 test_lecmp(s, s2)
244 s2 = u'\ud800\udd01'
245 test_lecmp(s, s2)
246 s2 = u'\ud900\udd01'
247 test_lecmp(s, s2)
248 s2 = u'\uda00\udd01'
249 test_lecmp(s, s2)
250 s2 = u'\udb00\udd01'
251 test_lecmp(s, s2)
252 s2 = u'\ud800\ude01'
253 test_lecmp(s, s2)
254 s2 = u'\ud900\ude01'
255 test_lecmp(s, s2)
256 s2 = u'\uda00\ude01'
257 test_lecmp(s, s2)
258 s2 = u'\udb00\ude01'
259 test_lecmp(s, s2)
260 s2 = u'\ud800\udfff'
261 test_lecmp(s, s2)
262 s2 = u'\ud900\udfff'
263 test_lecmp(s, s2)
264 s2 = u'\uda00\udfff'
265 test_lecmp(s, s2)
266 s2 = u'\udb00\udfff'
267 test_lecmp(s, s2)
Marc-André Lemburge5034372000-08-08 08:04:29 +0000268
269 test_fixup(u'\ue000')
270 test_fixup(u'\uff61')
271
272 # Surrogates on both sides, no fixup required
Marc-André Lemburg36619082001-01-17 19:11:13 +0000273 verify(u'\ud800\udc02' < u'\ud84d\udc56')
Marc-André Lemburge5034372000-08-08 08:04:29 +0000274 print 'done.'
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000275
Guido van Rossuma831cac2000-03-10 23:23:21 +0000276test('ljust', u'abc', u'abc ', 10)
277test('rjust', u'abc', u' abc', 10)
278test('center', u'abc', u' abc ', 10)
279test('ljust', u'abc', u'abc ', 6)
280test('rjust', u'abc', u' abc', 6)
281test('center', u'abc', u' abc ', 6)
282test('ljust', u'abc', u'abc', 2)
283test('rjust', u'abc', u'abc', 2)
284test('center', u'abc', u'abc', 2)
285
286test('islower', u'a', 1)
287test('islower', u'A', 0)
288test('islower', u'\n', 0)
289test('islower', u'\u1FFc', 0)
290test('islower', u'abc', 1)
291test('islower', u'aBc', 0)
292test('islower', u'abc\n', 1)
293
294test('isupper', u'a', 0)
295test('isupper', u'A', 1)
296test('isupper', u'\n', 0)
Marc-André Lemburgef0a0322001-02-10 14:09:31 +0000297if sys.platform[:4] != 'java':
298 test('isupper', u'\u1FFc', 0)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000299test('isupper', u'ABC', 1)
300test('isupper', u'AbC', 0)
301test('isupper', u'ABC\n', 1)
302
303test('istitle', u'a', 0)
304test('istitle', u'A', 1)
305test('istitle', u'\n', 0)
306test('istitle', u'\u1FFc', 1)
307test('istitle', u'A Titlecased Line', 1)
308test('istitle', u'A\nTitlecased Line', 1)
309test('istitle', u'A Titlecased, Line', 1)
310test('istitle', u'Greek \u1FFcitlecases ...', 1)
311test('istitle', u'Not a capitalized String', 0)
312test('istitle', u'Not\ta Titlecase String', 0)
313test('istitle', u'Not--a Titlecase String', 0)
314
Marc-André Lemburg9d467412000-07-05 09:46:40 +0000315test('isalpha', u'a', 1)
316test('isalpha', u'A', 1)
317test('isalpha', u'\n', 0)
318test('isalpha', u'\u1FFc', 1)
319test('isalpha', u'abc', 1)
320test('isalpha', u'aBc123', 0)
321test('isalpha', u'abc\n', 0)
322
323test('isalnum', u'a', 1)
324test('isalnum', u'A', 1)
325test('isalnum', u'\n', 0)
326test('isalnum', u'123abc456', 1)
327test('isalnum', u'a1b3c', 1)
328test('isalnum', u'aBc000 ', 0)
329test('isalnum', u'abc\n', 0)
330
Guido van Rossuma831cac2000-03-10 23:23:21 +0000331test('splitlines', u"abc\ndef\n\rghi", [u'abc', u'def', u'', u'ghi'])
332test('splitlines', u"abc\ndef\n\r\nghi", [u'abc', u'def', u'', u'ghi'])
333test('splitlines', u"abc\ndef\r\nghi", [u'abc', u'def', u'ghi'])
334test('splitlines', u"abc\ndef\r\nghi\n", [u'abc', u'def', u'ghi'])
335test('splitlines', u"abc\ndef\r\nghi\n\r", [u'abc', u'def', u'ghi', u''])
336test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'', u'abc', u'def', u'ghi', u''])
Guido van Rossum7ee801d2000-04-11 15:37:02 +0000337test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'\n', u'abc\n', u'def\r\n', u'ghi\n', u'\r'], 1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000338
339test('translate', u"abababc", u'bbbc', {ord('a'):None})
340test('translate', u"abababc", u'iiic', {ord('a'):None, ord('b'):ord('i')})
341test('translate', u"abababc", u'iiix', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})
342
Guido van Rossumd4d26842000-03-13 23:21:48 +0000343# Contains:
344print 'Testing Unicode contains method...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000345verify(('a' in u'abdb') == 1)
346verify(('a' in u'bdab') == 1)
347verify(('a' in u'bdaba') == 1)
348verify(('a' in u'bdba') == 1)
349verify(('a' in u'bdba') == 1)
350verify((u'a' in u'bdba') == 1)
351verify((u'a' in u'bdb') == 0)
352verify((u'a' in 'bdb') == 0)
353verify((u'a' in 'bdba') == 1)
354verify((u'a' in ('a',1,None)) == 1)
355verify((u'a' in (1,None,'a')) == 1)
356verify((u'a' in (1,None,u'a')) == 1)
357verify(('a' in ('a',1,None)) == 1)
358verify(('a' in (1,None,'a')) == 1)
359verify(('a' in (1,None,u'a')) == 1)
360verify(('a' in ('x',1,u'y')) == 0)
361verify(('a' in ('x',1,None)) == 0)
Guido van Rossumd4d26842000-03-13 23:21:48 +0000362print 'done.'
363
Guido van Rossuma831cac2000-03-10 23:23:21 +0000364# Formatting:
365print 'Testing Unicode formatting strings...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000366verify(u"%s, %s" % (u"abc", "abc") == u'abc, abc')
367verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3) == u'abc, abc, 1, 2.000000, 3.00')
368verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3) == u'abc, abc, 1, -2.000000, 3.00')
369verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5) == u'abc, abc, -1, -2.000000, 3.50')
370verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57) == u'abc, abc, -1, -2.000000, 3.57')
371verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57) == u'abc, abc, -1, -2.000000, 1003.57')
372verify(u"%c" % (u"a",) == u'a')
373verify(u"%c" % ("a",) == u'a')
374verify(u"%c" % (34,) == u'"')
375verify(u"%c" % (36,) == u'$')
Marc-André Lemburgef0a0322001-02-10 14:09:31 +0000376if sys.platform[:4] != 'java':
377 value = u"%r, %r" % (u"abc", "abc")
378 if value != u"u'abc', 'abc'":
379 print '*** formatting failed for "%s"' % 'u"%r, %r" % (u"abc", "abc")'
Marc-André Lemburg84625732000-06-13 12:05:36 +0000380
Marc-André Lemburg36619082001-01-17 19:11:13 +0000381verify(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"} == u'abc, def')
Marc-André Lemburg84625732000-06-13 12:05:36 +0000382try:
Marc-André Lemburg72f82132001-11-20 15:18:49 +0000383 value = u"%(x)s, %(ä)s" % {'x':u"abc", u'ä':"def"}
Marc-André Lemburg84625732000-06-13 12:05:36 +0000384except KeyError:
385 print '*** formatting failed for "%s"' % "u'abc, def'"
386else:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000387 verify(value == u'abc, def')
Marc-André Lemburg84625732000-06-13 12:05:36 +0000388
Guido van Rossum97064862000-04-10 13:52:48 +0000389# formatting jobs delegated from the string implementation:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000390verify('...%(foo)s...' % {'foo':u"abc"} == u'...abc...')
391verify('...%(foo)s...' % {'foo':"abc"} == '...abc...')
392verify('...%(foo)s...' % {u'foo':"abc"} == '...abc...')
393verify('...%(foo)s...' % {u'foo':u"abc"} == u'...abc...')
394verify('...%(foo)s...' % {u'foo':u"abc",'def':123} == u'...abc...')
395verify('...%(foo)s...' % {u'foo':u"abc",u'def':123} == u'...abc...')
396verify('...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...1...2...3...abc...')
397verify('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...%...%s...1...2...3...abc...')
398verify('...%s...' % u"abc" == u'...abc...')
Marc-André Lemburg542fe562001-05-02 14:21:53 +0000399verify('%*s' % (5,u'abc',) == u' abc')
400verify('%*s' % (-5,u'abc',) == u'abc ')
401verify('%*.*s' % (5,2,u'abc',) == u' ab')
402verify('%*.*s' % (5,3,u'abc',) == u' abc')
403verify('%i %*.*s' % (10, 5,3,u'abc',) == u'10 abc')
404verify('%i%s %*.*s' % (10, 3, 5,3,u'abc',) == u'103 abc')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000405print 'done.'
406
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000407print 'Testing builtin unicode()...',
408
409# unicode(obj) tests (this maps to PyObject_Unicode() at C level)
410
411verify(unicode(u'unicode remains unicode') == u'unicode remains unicode')
412
413class UnicodeSubclass(unicode):
414 pass
415
416verify(unicode(UnicodeSubclass('unicode subclass becomes unicode'))
417 == u'unicode subclass becomes unicode')
418
419verify(unicode('strings are converted to unicode')
420 == u'strings are converted to unicode')
421
class UnicodeCompat:
    """Object whose ``__unicode__`` hands back the value it was built with."""

    def __init__(self, x):
        # Stored untouched; __unicode__ returns it as-is.
        self.x = x

    def __unicode__(self):
        stored = self.x
        return stored
427
428verify(unicode(UnicodeCompat('__unicode__ compatible objects are recognized'))
429 == u'__unicode__ compatible objects are recognized')
430
class StringCompat:
    """Object whose ``__str__`` hands back the value it was built with."""

    def __init__(self, x):
        # Stored untouched; __str__ returns it as-is.
        self.x = x

    def __str__(self):
        stored = self.x
        return stored
436
437verify(unicode(StringCompat('__str__ compatible objects are recognized'))
438 == u'__str__ compatible objects are recognized')
439
440# unicode(obj) is compatible to str():
441
442o = StringCompat('unicode(obj) is compatible to str()')
443verify(unicode(o) == u'unicode(obj) is compatible to str()')
444verify(str(o) == 'unicode(obj) is compatible to str()')
445
446for obj in (123, 123.45, 123L):
447 verify(unicode(obj) == unicode(str(obj)))
448
449# unicode(obj, encoding, error) tests (this maps to
450# PyUnicode_FromEncodedObject() at C level)
451
Finn Bock2b29cb22001-12-10 20:57:34 +0000452if not sys.platform.startswith('java'):
453 try:
454 unicode(u'decoding unicode is not supported', 'utf-8', 'strict')
455 except TypeError:
456 pass
457 else:
458 raise TestFailed, "decoding unicode should NOT be supported"
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000459
460verify(unicode('strings are decoded to unicode', 'utf-8', 'strict')
461 == u'strings are decoded to unicode')
462
Finn Bock2b29cb22001-12-10 20:57:34 +0000463if not sys.platform.startswith('java'):
464 verify(unicode(buffer('character buffers are decoded to unicode'),
465 'utf-8', 'strict')
466 == u'character buffers are decoded to unicode')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000467
468print 'done.'
469
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000470# Test builtin codecs
471print 'Testing builtin codecs...',
472
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000473# UTF-7 specific encoding tests:
474utfTests = [(u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
475 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
476 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
477 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
478 (u'+', '+-'),
479 (u'+-', '+--'),
480 (u'+?', '+-?'),
481 (u'\?', '+AFw?'),
482 (u'+?', '+-?'),
483 (ur'\\?', '+AFwAXA?'),
484 (ur'\\\?', '+AFwAXABc?'),
485 (ur'++--', '+-+---')]
486
487for x,y in utfTests:
488 verify( x.encode('utf-7') == y )
489
Tim Peters527e64f2001-10-04 05:36:56 +0000490try:
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000491 unicode('+3ADYAA-', 'utf-7') # surrogates not supported
492except UnicodeError:
493 pass
494else:
495 raise TestFailed, "unicode('+3ADYAA-', 'utf-7') failed to raise an exception"
496
497verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')
498
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000499# UTF-8 specific encoding tests:
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +0000500verify(u''.encode('utf-8') == '')
Marc-André Lemburg3688a882002-02-06 18:09:02 +0000501verify(u'\u20ac'.encode('utf-8') == '\xe2\x82\xac')
502verify(u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82')
503verify(u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96')
504verify(u'\ud800'.encode('utf-8') == '\xed\xa0\x80')
505verify(u'\udc00'.encode('utf-8') == '\xed\xb0\x80')
506verify((u'\ud800\udc02'*1000).encode('utf-8') ==
507 '\xf0\x90\x80\x82'*1000)
508
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000509# UTF-8 specific decoding tests
Marc-André Lemburg3688a882002-02-06 18:09:02 +0000510verify(unicode('\xf0\xa3\x91\x96', 'utf-8') == u'\U00023456' )
511verify(unicode('\xf0\x90\x80\x82', 'utf-8') == u'\U00010002' )
512verify(unicode('\xe2\x82\xac', 'utf-8') == u'\u20ac' )
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000513
514# Other possible utf-8 test cases:
515# * strict decoding testing for all of the
516# UTF8_ERROR cases in PyUnicode_DecodeUTF8
517
Marc-André Lemburg36619082001-01-17 19:11:13 +0000518verify(unicode('hello','ascii') == u'hello')
519verify(unicode('hello','utf-8') == u'hello')
520verify(unicode('hello','utf8') == u'hello')
521verify(unicode('hello','latin-1') == u'hello')
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000522
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000523# Error handling
Guido van Rossum97064862000-04-10 13:52:48 +0000524try:
525 u'Andr\202 x'.encode('ascii')
526 u'Andr\202 x'.encode('ascii','strict')
527except ValueError:
528 pass
529else:
Guido van Rossuma1374e42001-01-19 19:01:56 +0000530 raise TestFailed, "u'Andr\202'.encode('ascii') failed to raise an exception"
Marc-André Lemburg36619082001-01-17 19:11:13 +0000531verify(u'Andr\202 x'.encode('ascii','ignore') == "Andr x")
532verify(u'Andr\202 x'.encode('ascii','replace') == "Andr? x")
Guido van Rossum97064862000-04-10 13:52:48 +0000533
534try:
535 unicode('Andr\202 x','ascii')
536 unicode('Andr\202 x','ascii','strict')
537except ValueError:
538 pass
539else:
Guido van Rossuma1374e42001-01-19 19:01:56 +0000540 raise TestFailed, "unicode('Andr\202') failed to raise an exception"
Marc-André Lemburg36619082001-01-17 19:11:13 +0000541verify(unicode('Andr\202 x','ascii','ignore') == u"Andr x")
542verify(unicode('Andr\202 x','ascii','replace') == u'Andr\uFFFD x')
Guido van Rossum97064862000-04-10 13:52:48 +0000543
Marc-André Lemburg36619082001-01-17 19:11:13 +0000544verify(u'hello'.encode('ascii') == 'hello')
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000545verify(u'hello'.encode('utf-7') == 'hello')
Marc-André Lemburg36619082001-01-17 19:11:13 +0000546verify(u'hello'.encode('utf-8') == 'hello')
547verify(u'hello'.encode('utf8') == 'hello')
548verify(u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000')
549verify(u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o')
550verify(u'hello'.encode('latin-1') == 'hello')
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000551
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000552# Roundtrip safety for BMP (just the first 1024 chars)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000553u = u''.join(map(unichr, range(1024)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000554for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000555 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
Marc-André Lemburg36619082001-01-17 19:11:13 +0000556 verify(unicode(u.encode(encoding),encoding) == u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000557
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +0000558# Roundtrip safety for BMP (just the first 256 chars)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000559u = u''.join(map(unichr, range(256)))
Guido van Rossum9e896b32000-04-05 20:11:21 +0000560for encoding in (
561 'latin-1',
562 ):
563 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000564 verify(unicode(u.encode(encoding),encoding) == u)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000565 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000566 print '*** codec "%s" failed round-trip' % encoding
567 except ValueError,why:
568 print '*** codec for "%s" failed: %s' % (encoding, why)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000569
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +0000570# Roundtrip safety for BMP (just the first 128 chars)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000571u = u''.join(map(unichr, range(128)))
Guido van Rossum9e896b32000-04-05 20:11:21 +0000572for encoding in (
573 'ascii',
574 ):
575 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000576 verify(unicode(u.encode(encoding),encoding) == u)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000577 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000578 print '*** codec "%s" failed round-trip' % encoding
579 except ValueError,why:
580 print '*** codec for "%s" failed: %s' % (encoding, why)
581
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +0000582# Roundtrip safety for non-BMP (just a few chars)
583u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
584for encoding in ('utf-8',
585 'utf-16', 'utf-16-le', 'utf-16-be',
586 #'raw_unicode_escape',
587 'unicode_escape', 'unicode_internal'):
588 verify(unicode(u.encode(encoding),encoding) == u)
589
590# UTF-8 must be roundtrip safe for all UCS-2 code points
591u = u''.join(map(unichr, range(0x10000)))
592for encoding in ('utf-8',):
593 verify(unicode(u.encode(encoding),encoding) == u)
594
Guido van Rossum9e896b32000-04-05 20:11:21 +0000595print 'done.'
596
597print 'Testing standard mapping codecs...',
598
599print '0-127...',
600s = ''.join(map(chr, range(128)))
601for encoding in (
602 'cp037', 'cp1026',
603 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
604 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
Fred Drake004d5e62000-10-23 17:22:08 +0000605 'cp863', 'cp865', 'cp866',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000606 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
607 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
608 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
609 'mac_cyrillic', 'mac_latin2',
610
611 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
612 'cp1256', 'cp1257', 'cp1258',
613 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
614
615 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Tim Peters2f228e72001-05-13 00:19:31 +0000616 'cp1006', 'iso8859_8',
Fred Drake004d5e62000-10-23 17:22:08 +0000617
Guido van Rossum9e896b32000-04-05 20:11:21 +0000618 ### These have undefined mappings:
619 #'cp424',
Fred Drake004d5e62000-10-23 17:22:08 +0000620
Tim Peters2f228e72001-05-13 00:19:31 +0000621 ### These fail the round-trip:
622 #'cp875'
623
Guido van Rossum9e896b32000-04-05 20:11:21 +0000624 ):
625 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000626 verify(unicode(s,encoding).encode(encoding) == s)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000627 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000628 print '*** codec "%s" failed round-trip' % encoding
629 except ValueError,why:
630 print '*** codec for "%s" failed: %s' % (encoding, why)
631
632print '128-255...',
633s = ''.join(map(chr, range(128,256)))
634for encoding in (
635 'cp037', 'cp1026',
636 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
637 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
Fred Drake004d5e62000-10-23 17:22:08 +0000638 'cp863', 'cp865', 'cp866',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000639 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
Tim Petersd2bf3b72001-01-18 02:22:22 +0000640 'iso8859_2', 'iso8859_4', 'iso8859_5',
Marc-André Lemburga866df82001-01-03 21:29:14 +0000641 'iso8859_9', 'koi8_r', 'latin_1',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000642 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +0000643
Guido van Rossum9e896b32000-04-05 20:11:21 +0000644 ### These have undefined mappings:
645 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
646 #'cp1256', 'cp1257', 'cp1258',
647 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Tim Petersd2bf3b72001-01-18 02:22:22 +0000648 #'iso8859_3', 'iso8859_6', 'iso8859_7',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000649 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +0000650
Guido van Rossum9e896b32000-04-05 20:11:21 +0000651 ### These fail the round-trip:
652 #'cp1006', 'cp875', 'iso8859_8',
Fred Drake004d5e62000-10-23 17:22:08 +0000653
Guido van Rossum9e896b32000-04-05 20:11:21 +0000654 ):
655 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000656 verify(unicode(s,encoding).encode(encoding) == s)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000657 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000658 print '*** codec "%s" failed round-trip' % encoding
659 except ValueError,why:
660 print '*** codec for "%s" failed: %s' % (encoding, why)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000661
662print 'done.'
Fred Drakee0243e22000-04-13 14:11:56 +0000663
664print 'Testing Unicode string concatenation...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000665verify((u"abc" u"def") == u"abcdef")
666verify(("abc" u"def") == u"abcdef")
667verify((u"abc" "def") == u"abcdef")
668verify((u"abc" u"def" "ghi") == u"abcdefghi")
669verify(("abc" "def" u"ghi") == u"abcdefghi")
Fred Drakee0243e22000-04-13 14:11:56 +0000670print 'done.'
Marc-André Lemburg0c4d8d02001-11-20 15:17:25 +0000671
672print 'Testing Unicode printing...',
673print u'abc'
674print u'abc', u'def'
675print u'abc', 'def'
676print 'abc', u'def'
677print u'abc\n'
678print u'abc\n',
679print u'abc\n',
680print u'def\n'
681print u'def\n'
682print 'done.'