""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
from test_support import verify, verbose, TestFailed
import sys

# Test basic sanity of repr()
# Backslashes are doubled and control characters are escaped.
verify(repr(u'abc') == "u'abc'")
verify(repr(u'ab\\c') == "u'ab\\\\c'")
verify(repr(u'ab\\') == "u'ab\\\\'")
verify(repr(u'\\c') == "u'\\\\c'")
verify(repr(u'\\') == "u'\\\\'")
verify(repr(u'\n') == "u'\\n'")
verify(repr(u'\r') == "u'\\r'")
verify(repr(u'\t') == "u'\\t'")
verify(repr(u'\b') == "u'\\x08'")
# Quote selection: double quotes are used only when the string contains
# a single quote and no double quote.
verify(repr(u"'\"") == """u'\\'"'""")
verify(repr(u"'") == '''u"'"''')
verify(repr(u'"') == """u'"'""")

Guido van Rossuma831cac2000-03-10 23:23:21 +000026def test(method, input, output, *args):
27 if verbose:
Guido van Rossum15ffc712000-11-29 12:13:59 +000028 print '%s.%s%s =? %s... ' % (repr(input), method, args, repr(output)),
Guido van Rossuma831cac2000-03-10 23:23:21 +000029 try:
30 f = getattr(input, method)
31 value = apply(f, args)
32 except:
33 value = sys.exc_type
Guido van Rossum66503202000-04-28 20:39:58 +000034 exc = sys.exc_info()[:2]
Guido van Rossuma831cac2000-03-10 23:23:21 +000035 else:
36 exc = None
Guido van Rossum15ffc712000-11-29 12:13:59 +000037 if value != output or type(value) is not type(output):
Guido van Rossuma831cac2000-03-10 23:23:21 +000038 if verbose:
39 print 'no'
40 print '*',f, `input`, `output`, `value`
41 if exc:
Guido van Rossum66503202000-04-28 20:39:58 +000042 print ' value == %s: %s' % (exc)
Guido van Rossuma831cac2000-03-10 23:23:21 +000043 else:
44 if verbose:
45 print 'yes'
46
# capitalize()
test('capitalize', u' hello ', u' hello ')
test('capitalize', u'hello ', u'Hello ')
test('capitalize', u'aaaa', u'Aaaa')
test('capitalize', u'AaAa', u'Aaaa')

# count(): 8-bit and Unicode strings are mixed on both sides.
test('count', u'aaa', 3, u'a')
test('count', u'aaa', 0, u'b')
test('count', 'aaa', 3, u'a')
test('count', 'aaa', 0, u'b')
test('count', u'aaa', 3, 'a')
test('count', u'aaa', 0, 'b')

# title()
test('title', u' hello ', u' Hello ')
test('title', u'hello ', u'Hello ')
test('title', u"fOrMaT thIs aS titLe String", u'Format This As Title String')
test('title', u"fOrMaT,thIs-aS*titLe;String", u'Format,This-As*Title;String')
test('title', u"getInt", u'Getint')

# find() / rfind()
test('find', u'abcdefghiabc', 0, u'abc')
test('find', u'abcdefghiabc', 9, u'abc', 1)
test('find', u'abcdefghiabc', -1, u'def', 4)

test('rfind', u'abcdefghiabc', 9, u'abc')

# lower() / upper()
test('lower', u'HeLLo', u'hello')
test('lower', u'hello', u'hello')

test('upper', u'HeLLo', u'HELLO')
test('upper', u'HELLO', u'HELLO')

# Disabled: the "if 0" guard keeps these from ever running.
if 0:
    transtable = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`xyzdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377'

    test('maketrans', u'abc', transtable, u'xyz')
    test('maketrans', u'abc', ValueError, u'xyzq')

# split(): whitespace splitting, explicit separators, maxsplit, and
# mixed 8-bit/Unicode string and separator.
test('split', u'this is the split function',
     [u'this', u'is', u'the', u'split', u'function'])
test('split', u'a|b|c|d', [u'a', u'b', u'c', u'd'], u'|')
test('split', u'a|b|c|d', [u'a', u'b', u'c|d'], u'|', 2)
test('split', u'a b c d', [u'a', u'b c d'], None, 1)
test('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 3)
test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 4)
test('split', u'a b c d', [u'a b c d'], None, 0)
test('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
test('split', u'a b c d ', [u'a', u'b', u'c', u'd'])
test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], '//')
test('split', 'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
test('split', u'endcase test', [u'endcase ', u''], u'test')
test('split', u'endcase test', [u'endcase ', u''], 'test')
test('split', 'endcase test', [u'endcase ', u''], u'test')


# join now works with any sequence type
class Sequence:
    """Wrapper exposing only len() and indexing of the wrapped sequence."""

    def __init__(self, seq):
        self.seq = seq

    def __len__(self):
        return len(self.seq)

    def __getitem__(self, i):
        return self.seq[i]

108test('join', u' ', u'a b c d', [u'a', u'b', u'c', u'd'])
Guido van Rossum15ffc712000-11-29 12:13:59 +0000109test('join', u' ', u'a b c d', ['a', 'b', u'c', u'd'])
Guido van Rossuma831cac2000-03-10 23:23:21 +0000110test('join', u'', u'abcd', (u'a', u'b', u'c', u'd'))
Guido van Rossum15ffc712000-11-29 12:13:59 +0000111test('join', u' ', u'w x y z', Sequence('wxyz'))
Guido van Rossuma831cac2000-03-10 23:23:21 +0000112test('join', u' ', TypeError, 7)
Guido van Rossum15ffc712000-11-29 12:13:59 +0000113test('join', u' ', TypeError, Sequence([7, u'hello', 123L]))
114test('join', ' ', u'a b c d', [u'a', u'b', u'c', u'd'])
115test('join', ' ', u'a b c d', ['a', 'b', u'c', u'd'])
116test('join', '', u'abcd', (u'a', u'b', u'c', u'd'))
117test('join', ' ', u'w x y z', Sequence(u'wxyz'))
118test('join', ' ', TypeError, 7)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000119
120result = u''
121for i in range(10):
122 if i > 0:
123 result = result + u':'
124 result = result + u'x'*10
125test('join', u':', result, [u'x' * 10] * 10)
126test('join', u':', result, (u'x' * 10,) * 10)
127
128test('strip', u' hello ', u'hello')
129test('lstrip', u' hello ', u'hello ')
130test('rstrip', u' hello ', u' hello')
131test('strip', u'hello', u'hello')
132
133test('swapcase', u'HeLLo cOmpUteRs', u'hEllO CoMPuTErS')
134
135if 0:
136 test('translate', u'xyzabcdef', u'xyzxyz', transtable, u'def')
137
138 table = string.maketrans('a', u'A')
139 test('translate', u'abc', u'Abc', table)
140 test('translate', u'xyz', u'xyz', table)
141
# replace(): the optional last argument limits the number of replacements.
test('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1)
test('replace', u'one!two!three!', u'onetwothree', '!', '')
test('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4)
test('replace', u'one!two!three!', u'one!two!three!', u'!', u'@', 0)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@')
test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@')
test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@', 2)

# startswith(prefix[, start[, end]])
test('startswith', u'hello', 1, u'he')
test('startswith', u'hello', 1, u'hello')
test('startswith', u'hello', 0, u'hello world')
test('startswith', u'hello', 1, u'')
test('startswith', u'hello', 0, u'ello')
test('startswith', u'hello', 1, u'ello', 1)
test('startswith', u'hello', 1, u'o', 4)
test('startswith', u'hello', 0, u'o', 5)
test('startswith', u'hello', 1, u'', 5)
test('startswith', u'hello', 0, u'lo', 6)
test('startswith', u'helloworld', 1, u'lowo', 3)
test('startswith', u'helloworld', 1, u'lowo', 3, 7)
test('startswith', u'helloworld', 0, u'lowo', 3, 6)

# endswith(suffix[, start[, end]])
test('endswith', u'hello', 1, u'lo')
test('endswith', u'hello', 0, u'he')
test('endswith', u'hello', 1, u'')
test('endswith', u'hello', 0, u'hello world')
test('endswith', u'helloworld', 0, u'worl')
test('endswith', u'helloworld', 1, u'worl', 3, 9)
test('endswith', u'helloworld', 1, u'world', 3, 12)
test('endswith', u'helloworld', 1, u'lowo', 1, 7)
test('endswith', u'helloworld', 1, u'lowo', 2, 7)
test('endswith', u'helloworld', 1, u'lowo', 3, 7)
test('endswith', u'helloworld', 0, u'lowo', 4, 7)
test('endswith', u'helloworld', 0, u'lowo', 3, 8)
test('endswith', u'ab', 0, u'ab', 0, 1)
test('endswith', u'ab', 0, u'ab', 0, 0)

# expandtabs(): each tab pads to the next multiple of the tab size
# (default 8); the column count restarts after '\r' and '\n'.
test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab      def\ng       hi')
test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab      def\ng       hi', 8)
test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab  def\ng   hi', 4)
test('expandtabs', u'abc\r\nab\tdef\ng\thi', u'abc\r\nab  def\ng   hi', 4)

# Disabled: the "if 0" guard keeps these from ever running.
if 0:
    test('capwords', u'abc def ghi', u'Abc Def Ghi')
    test('capwords', u'abc\tdef\nghi', u'Abc Def Ghi')
    test('capwords', u'abc\t def \nghi', u'Abc Def Ghi')

191# Comparisons:
192print 'Testing Unicode comparisons...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000193verify(u'abc' == 'abc')
194verify('abc' == u'abc')
195verify(u'abc' == u'abc')
196verify(u'abcd' > 'abc')
197verify('abcd' > u'abc')
198verify(u'abcd' > u'abc')
199verify(u'abc' < 'abcd')
200verify('abc' < u'abcd')
201verify(u'abc' < u'abcd')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000202print 'done.'
203
Marc-André Lemburge5034372000-08-08 08:04:29 +0000204if 0:
205 # Move these tests to a Unicode collation module test...
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000206
Marc-André Lemburge5034372000-08-08 08:04:29 +0000207 print 'Testing UTF-16 code point order comparisons...',
208 #No surrogates, no fixup required.
Marc-André Lemburg36619082001-01-17 19:11:13 +0000209 verify(u'\u0061' < u'\u20ac')
Marc-André Lemburge5034372000-08-08 08:04:29 +0000210 # Non surrogate below surrogate value, no fixup required
Marc-André Lemburg36619082001-01-17 19:11:13 +0000211 verify(u'\u0061' < u'\ud800\udc02')
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000212
Marc-André Lemburge5034372000-08-08 08:04:29 +0000213 # Non surrogate above surrogate value, fixup required
214 def test_lecmp(s, s2):
Tim Petersd2bf3b72001-01-18 02:22:22 +0000215 verify(s < s2 , "comparison failed on %s < %s" % (s, s2))
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000216
Marc-André Lemburge5034372000-08-08 08:04:29 +0000217 def test_fixup(s):
Fred Drake004d5e62000-10-23 17:22:08 +0000218 s2 = u'\ud800\udc01'
219 test_lecmp(s, s2)
220 s2 = u'\ud900\udc01'
221 test_lecmp(s, s2)
222 s2 = u'\uda00\udc01'
223 test_lecmp(s, s2)
224 s2 = u'\udb00\udc01'
225 test_lecmp(s, s2)
226 s2 = u'\ud800\udd01'
227 test_lecmp(s, s2)
228 s2 = u'\ud900\udd01'
229 test_lecmp(s, s2)
230 s2 = u'\uda00\udd01'
231 test_lecmp(s, s2)
232 s2 = u'\udb00\udd01'
233 test_lecmp(s, s2)
234 s2 = u'\ud800\ude01'
235 test_lecmp(s, s2)
236 s2 = u'\ud900\ude01'
237 test_lecmp(s, s2)
238 s2 = u'\uda00\ude01'
239 test_lecmp(s, s2)
240 s2 = u'\udb00\ude01'
241 test_lecmp(s, s2)
242 s2 = u'\ud800\udfff'
243 test_lecmp(s, s2)
244 s2 = u'\ud900\udfff'
245 test_lecmp(s, s2)
246 s2 = u'\uda00\udfff'
247 test_lecmp(s, s2)
248 s2 = u'\udb00\udfff'
249 test_lecmp(s, s2)
Marc-André Lemburge5034372000-08-08 08:04:29 +0000250
251 test_fixup(u'\ue000')
252 test_fixup(u'\uff61')
253
254 # Surrogates on both sides, no fixup required
Marc-André Lemburg36619082001-01-17 19:11:13 +0000255 verify(u'\ud800\udc02' < u'\ud84d\udc56')
Marc-André Lemburge5034372000-08-08 08:04:29 +0000256 print 'done.'
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000257
# ljust() / rjust() / center(): the result is space-padded to the given
# width; a width smaller than the string leaves it unchanged.  center()
# puts the odd extra space on the right.
test('ljust', u'abc', u'abc       ', 10)
test('rjust', u'abc', u'       abc', 10)
test('center', u'abc', u'   abc    ', 10)
test('ljust', u'abc', u'abc   ', 6)
test('rjust', u'abc', u'   abc', 6)
test('center', u'abc', u' abc  ', 6)
test('ljust', u'abc', u'abc', 2)
test('rjust', u'abc', u'abc', 2)
test('center', u'abc', u'abc', 2)

# islower()
test('islower', u'a', 1)
test('islower', u'A', 0)
test('islower', u'\n', 0)
test('islower', u'\u1FFc', 0)
test('islower', u'abc', 1)
test('islower', u'aBc', 0)
test('islower', u'abc\n', 1)

# isupper()
test('isupper', u'a', 0)
test('isupper', u'A', 1)
test('isupper', u'\n', 0)
# This one case is skipped when sys.platform starts with 'java'.
if sys.platform[:4] != 'java':
    test('isupper', u'\u1FFc', 0)
test('isupper', u'ABC', 1)
test('isupper', u'AbC', 0)
test('isupper', u'ABC\n', 1)

# istitle()
test('istitle', u'a', 0)
test('istitle', u'A', 1)
test('istitle', u'\n', 0)
test('istitle', u'\u1FFc', 1)
test('istitle', u'A Titlecased Line', 1)
test('istitle', u'A\nTitlecased Line', 1)
test('istitle', u'A Titlecased, Line', 1)
test('istitle', u'Greek \u1FFcitlecases ...', 1)
test('istitle', u'Not a capitalized String', 0)
test('istitle', u'Not\ta Titlecase String', 0)
test('istitle', u'Not--a Titlecase String', 0)

# isalpha()
test('isalpha', u'a', 1)
test('isalpha', u'A', 1)
test('isalpha', u'\n', 0)
test('isalpha', u'\u1FFc', 1)
test('isalpha', u'abc', 1)
test('isalpha', u'aBc123', 0)
test('isalpha', u'abc\n', 0)

# isalnum()
test('isalnum', u'a', 1)
test('isalnum', u'A', 1)
test('isalnum', u'\n', 0)
test('isalnum', u'123abc456', 1)
test('isalnum', u'a1b3c', 1)
test('isalnum', u'aBc000 ', 0)
test('isalnum', u'abc\n', 0)

# splitlines(): '\n', '\r' and '\r\n' each end a line; a true argument
# keeps the line endings in the result.
test('splitlines', u"abc\ndef\n\rghi", [u'abc', u'def', u'', u'ghi'])
test('splitlines', u"abc\ndef\n\r\nghi", [u'abc', u'def', u'', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi", [u'abc', u'def', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi\n", [u'abc', u'def', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi\n\r", [u'abc', u'def', u'ghi', u''])
test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'', u'abc', u'def', u'ghi', u''])
test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'\n', u'abc\n', u'def\r\n', u'ghi\n', u'\r'], 1)

# translate(): a None mapping deletes the character; an ordinal or a
# Unicode string replaces it.
test('translate', u"abababc", u'bbbc', {ord('a'):None})
test('translate', u"abababc", u'iiic', {ord('a'):None, ord('b'):ord('i')})
test('translate', u"abababc", u'iiix', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})

Guido van Rossumd4d26842000-03-13 23:21:48 +0000325# Contains:
326print 'Testing Unicode contains method...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000327verify(('a' in u'abdb') == 1)
328verify(('a' in u'bdab') == 1)
329verify(('a' in u'bdaba') == 1)
330verify(('a' in u'bdba') == 1)
331verify(('a' in u'bdba') == 1)
332verify((u'a' in u'bdba') == 1)
333verify((u'a' in u'bdb') == 0)
334verify((u'a' in 'bdb') == 0)
335verify((u'a' in 'bdba') == 1)
336verify((u'a' in ('a',1,None)) == 1)
337verify((u'a' in (1,None,'a')) == 1)
338verify((u'a' in (1,None,u'a')) == 1)
339verify(('a' in ('a',1,None)) == 1)
340verify(('a' in (1,None,'a')) == 1)
341verify(('a' in (1,None,u'a')) == 1)
342verify(('a' in ('x',1,u'y')) == 0)
343verify(('a' in ('x',1,None)) == 0)
Guido van Rossumd4d26842000-03-13 23:21:48 +0000344print 'done.'
345
Guido van Rossuma831cac2000-03-10 23:23:21 +0000346# Formatting:
347print 'Testing Unicode formatting strings...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000348verify(u"%s, %s" % (u"abc", "abc") == u'abc, abc')
349verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3) == u'abc, abc, 1, 2.000000, 3.00')
350verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3) == u'abc, abc, 1, -2.000000, 3.00')
351verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5) == u'abc, abc, -1, -2.000000, 3.50')
352verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57) == u'abc, abc, -1, -2.000000, 3.57')
353verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57) == u'abc, abc, -1, -2.000000, 1003.57')
354verify(u"%c" % (u"a",) == u'a')
355verify(u"%c" % ("a",) == u'a')
356verify(u"%c" % (34,) == u'"')
357verify(u"%c" % (36,) == u'$')
Marc-André Lemburgef0a0322001-02-10 14:09:31 +0000358if sys.platform[:4] != 'java':
359 value = u"%r, %r" % (u"abc", "abc")
360 if value != u"u'abc', 'abc'":
361 print '*** formatting failed for "%s"' % 'u"%r, %r" % (u"abc", "abc")'
Marc-André Lemburg84625732000-06-13 12:05:36 +0000362
Marc-André Lemburg36619082001-01-17 19:11:13 +0000363verify(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"} == u'abc, def')
Marc-André Lemburg84625732000-06-13 12:05:36 +0000364try:
Marc-André Lemburgef0a0322001-02-10 14:09:31 +0000365 if sys.platform[:4] != 'java':
366 value = u"%(x)s, %(ä)s" % {'x':u"abc", u'ä'.encode('utf-8'):"def"}
367 else:
368 value = u"%(x)s, %(ä)s" % {'x':u"abc", u'ä':"def"}
Marc-André Lemburg84625732000-06-13 12:05:36 +0000369except KeyError:
370 print '*** formatting failed for "%s"' % "u'abc, def'"
371else:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000372 verify(value == u'abc, def')
Marc-André Lemburg84625732000-06-13 12:05:36 +0000373
Guido van Rossum97064862000-04-10 13:52:48 +0000374# formatting jobs delegated from the string implementation:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000375verify('...%(foo)s...' % {'foo':u"abc"} == u'...abc...')
376verify('...%(foo)s...' % {'foo':"abc"} == '...abc...')
377verify('...%(foo)s...' % {u'foo':"abc"} == '...abc...')
378verify('...%(foo)s...' % {u'foo':u"abc"} == u'...abc...')
379verify('...%(foo)s...' % {u'foo':u"abc",'def':123} == u'...abc...')
380verify('...%(foo)s...' % {u'foo':u"abc",u'def':123} == u'...abc...')
381verify('...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...1...2...3...abc...')
382verify('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...%...%s...1...2...3...abc...')
383verify('...%s...' % u"abc" == u'...abc...')
Marc-André Lemburg542fe562001-05-02 14:21:53 +0000384verify('%*s' % (5,u'abc',) == u' abc')
385verify('%*s' % (-5,u'abc',) == u'abc ')
386verify('%*.*s' % (5,2,u'abc',) == u' ab')
387verify('%*.*s' % (5,3,u'abc',) == u' abc')
388verify('%i %*.*s' % (10, 5,3,u'abc',) == u'10 abc')
389verify('%i%s %*.*s' % (10, 3, 5,3,u'abc',) == u'103 abc')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000390print 'done.'
391
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000392print 'Testing builtin unicode()...',
393
394# unicode(obj) tests (this maps to PyObject_Unicode() at C level)
395
396verify(unicode(u'unicode remains unicode') == u'unicode remains unicode')
397
class UnicodeSubclass(unicode):
    """Subclass of unicode that adds nothing of its own."""
    pass

verify(unicode(UnicodeSubclass('unicode subclass becomes unicode'))
       == u'unicode subclass becomes unicode')

verify(unicode('strings are converted to unicode')
       == u'strings are converted to unicode')

class UnicodeCompat:
    """Object whose __unicode__() returns the value it was created with."""

    def __init__(self, x):
        self.x = x

    def __unicode__(self):
        return self.x

verify(unicode(UnicodeCompat('__unicode__ compatible objects are recognized'))
       == u'__unicode__ compatible objects are recognized')

class StringCompat:
    """Object whose __str__() returns the value it was created with."""

    def __init__(self, x):
        self.x = x

    def __str__(self):
        return self.x

422verify(unicode(StringCompat('__str__ compatible objects are recognized'))
423 == u'__str__ compatible objects are recognized')
424
425# unicode(obj) is compatible to str():
426
427o = StringCompat('unicode(obj) is compatible to str()')
428verify(unicode(o) == u'unicode(obj) is compatible to str()')
429verify(str(o) == 'unicode(obj) is compatible to str()')
430
431for obj in (123, 123.45, 123L):
432 verify(unicode(obj) == unicode(str(obj)))
433
434# unicode(obj, encoding, error) tests (this maps to
435# PyUnicode_FromEncodedObject() at C level)
436
437try:
438 unicode(u'decoding unicode is not supported', 'utf-8', 'strict')
439except TypeError:
440 pass
441else:
442 raise TestFailed, "decoding unicode should NOT be supported"
443
444verify(unicode('strings are decoded to unicode', 'utf-8', 'strict')
445 == u'strings are decoded to unicode')
446
447verify(unicode(buffer('character buffers are decoded to unicode'),
448 'utf-8', 'strict')
449 == u'character buffers are decoded to unicode')
450
451print 'done.'
452
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000453# Test builtin codecs
454print 'Testing builtin codecs...',
455
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000456# UTF-7 specific encoding tests:
457utfTests = [(u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
458 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
459 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
460 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
461 (u'+', '+-'),
462 (u'+-', '+--'),
463 (u'+?', '+-?'),
464 (u'\?', '+AFw?'),
465 (u'+?', '+-?'),
466 (ur'\\?', '+AFwAXA?'),
467 (ur'\\\?', '+AFwAXABc?'),
468 (ur'++--', '+-+---')]
469
470for x,y in utfTests:
471 verify( x.encode('utf-7') == y )
472
Tim Peters527e64f2001-10-04 05:36:56 +0000473try:
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000474 unicode('+3ADYAA-', 'utf-7') # surrogates not supported
475except UnicodeError:
476 pass
477else:
478 raise TestFailed, "unicode('+3ADYAA-', 'utf-7') failed to raise an exception"
479
480verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')
481
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000482# UTF-8 specific encoding tests:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000483verify(u'\u20ac'.encode('utf-8') == \
484 ''.join((chr(0xe2), chr(0x82), chr(0xac))) )
485verify(u'\ud800\udc02'.encode('utf-8') == \
486 ''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))) )
487verify(u'\ud84d\udc56'.encode('utf-8') == \
488 ''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))) )
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000489# UTF-8 specific decoding tests
Tim Petersd2bf3b72001-01-18 02:22:22 +0000490verify(unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))),
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000491 'utf-8') == u'\U00023456' )
Tim Petersd2bf3b72001-01-18 02:22:22 +0000492verify(unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))),
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000493 'utf-8') == u'\U00010002' )
Tim Petersd2bf3b72001-01-18 02:22:22 +0000494verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
Marc-André Lemburg36619082001-01-17 19:11:13 +0000495 'utf-8') == u'\u20ac' )
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000496
497# Other possible utf-8 test cases:
498# * strict decoding testing for all of the
499# UTF8_ERROR cases in PyUnicode_DecodeUTF8
500
Marc-André Lemburg36619082001-01-17 19:11:13 +0000501verify(unicode('hello','ascii') == u'hello')
502verify(unicode('hello','utf-8') == u'hello')
503verify(unicode('hello','utf8') == u'hello')
504verify(unicode('hello','latin-1') == u'hello')
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000505
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000506# Error handling
Guido van Rossum97064862000-04-10 13:52:48 +0000507try:
508 u'Andr\202 x'.encode('ascii')
509 u'Andr\202 x'.encode('ascii','strict')
510except ValueError:
511 pass
512else:
Guido van Rossuma1374e42001-01-19 19:01:56 +0000513 raise TestFailed, "u'Andr\202'.encode('ascii') failed to raise an exception"
Marc-André Lemburg36619082001-01-17 19:11:13 +0000514verify(u'Andr\202 x'.encode('ascii','ignore') == "Andr x")
515verify(u'Andr\202 x'.encode('ascii','replace') == "Andr? x")
Guido van Rossum97064862000-04-10 13:52:48 +0000516
517try:
518 unicode('Andr\202 x','ascii')
519 unicode('Andr\202 x','ascii','strict')
520except ValueError:
521 pass
522else:
Guido van Rossuma1374e42001-01-19 19:01:56 +0000523 raise TestFailed, "unicode('Andr\202') failed to raise an exception"
Marc-André Lemburg36619082001-01-17 19:11:13 +0000524verify(unicode('Andr\202 x','ascii','ignore') == u"Andr x")
525verify(unicode('Andr\202 x','ascii','replace') == u'Andr\uFFFD x')
Guido van Rossum97064862000-04-10 13:52:48 +0000526
Marc-André Lemburg36619082001-01-17 19:11:13 +0000527verify(u'hello'.encode('ascii') == 'hello')
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000528verify(u'hello'.encode('utf-7') == 'hello')
Marc-André Lemburg36619082001-01-17 19:11:13 +0000529verify(u'hello'.encode('utf-8') == 'hello')
530verify(u'hello'.encode('utf8') == 'hello')
531verify(u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000')
532verify(u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o')
533verify(u'hello'.encode('latin-1') == 'hello')
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000534
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000535# Roundtrip safety for BMP (just the first 1024 chars)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000536u = u''.join(map(unichr, range(1024)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000537for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000538 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
Marc-André Lemburg36619082001-01-17 19:11:13 +0000539 verify(unicode(u.encode(encoding),encoding) == u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000540
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000541# Roundtrip safety for non-BMP (just a few chars)
542u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
543for encoding in ('utf-8',
544 'utf-16', 'utf-16-le', 'utf-16-be',
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +0000545 #'raw_unicode_escape',
546 'unicode_escape', 'unicode_internal'):
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000547 verify(unicode(u.encode(encoding),encoding) == u)
548
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000549u = u''.join(map(unichr, range(256)))
Guido van Rossum9e896b32000-04-05 20:11:21 +0000550for encoding in (
551 'latin-1',
552 ):
553 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000554 verify(unicode(u.encode(encoding),encoding) == u)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000555 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000556 print '*** codec "%s" failed round-trip' % encoding
557 except ValueError,why:
558 print '*** codec for "%s" failed: %s' % (encoding, why)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000559
560u = u''.join(map(unichr, range(128)))
Guido van Rossum9e896b32000-04-05 20:11:21 +0000561for encoding in (
562 'ascii',
563 ):
564 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000565 verify(unicode(u.encode(encoding),encoding) == u)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000566 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000567 print '*** codec "%s" failed round-trip' % encoding
568 except ValueError,why:
569 print '*** codec for "%s" failed: %s' % (encoding, why)
570
571print 'done.'
572
573print 'Testing standard mapping codecs...',
574
575print '0-127...',
576s = ''.join(map(chr, range(128)))
577for encoding in (
578 'cp037', 'cp1026',
579 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
580 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
Fred Drake004d5e62000-10-23 17:22:08 +0000581 'cp863', 'cp865', 'cp866',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000582 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
583 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
584 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
585 'mac_cyrillic', 'mac_latin2',
586
587 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
588 'cp1256', 'cp1257', 'cp1258',
589 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
590
591 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Tim Peters2f228e72001-05-13 00:19:31 +0000592 'cp1006', 'iso8859_8',
Fred Drake004d5e62000-10-23 17:22:08 +0000593
Guido van Rossum9e896b32000-04-05 20:11:21 +0000594 ### These have undefined mappings:
595 #'cp424',
Fred Drake004d5e62000-10-23 17:22:08 +0000596
Tim Peters2f228e72001-05-13 00:19:31 +0000597 ### These fail the round-trip:
598 #'cp875'
599
Guido van Rossum9e896b32000-04-05 20:11:21 +0000600 ):
601 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000602 verify(unicode(s,encoding).encode(encoding) == s)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000603 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000604 print '*** codec "%s" failed round-trip' % encoding
605 except ValueError,why:
606 print '*** codec for "%s" failed: %s' % (encoding, why)
607
608print '128-255...',
609s = ''.join(map(chr, range(128,256)))
610for encoding in (
611 'cp037', 'cp1026',
612 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
613 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
Fred Drake004d5e62000-10-23 17:22:08 +0000614 'cp863', 'cp865', 'cp866',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000615 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
Tim Petersd2bf3b72001-01-18 02:22:22 +0000616 'iso8859_2', 'iso8859_4', 'iso8859_5',
Marc-André Lemburga866df82001-01-03 21:29:14 +0000617 'iso8859_9', 'koi8_r', 'latin_1',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000618 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +0000619
Guido van Rossum9e896b32000-04-05 20:11:21 +0000620 ### These have undefined mappings:
621 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
622 #'cp1256', 'cp1257', 'cp1258',
623 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Tim Petersd2bf3b72001-01-18 02:22:22 +0000624 #'iso8859_3', 'iso8859_6', 'iso8859_7',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000625 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +0000626
Guido van Rossum9e896b32000-04-05 20:11:21 +0000627 ### These fail the round-trip:
628 #'cp1006', 'cp875', 'iso8859_8',
Fred Drake004d5e62000-10-23 17:22:08 +0000629
Guido van Rossum9e896b32000-04-05 20:11:21 +0000630 ):
631 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000632 verify(unicode(s,encoding).encode(encoding) == s)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000633 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000634 print '*** codec "%s" failed round-trip' % encoding
635 except ValueError,why:
636 print '*** codec for "%s" failed: %s' % (encoding, why)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000637
638print 'done.'
Fred Drakee0243e22000-04-13 14:11:56 +0000639
640print 'Testing Unicode string concatenation...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000641verify((u"abc" u"def") == u"abcdef")
642verify(("abc" u"def") == u"abcdef")
643verify((u"abc" "def") == u"abcdef")
644verify((u"abc" u"def" "ghi") == u"abcdefghi")
645verify(("abc" "def" u"ghi") == u"abcdefghi")
Fred Drakee0243e22000-04-13 14:11:56 +0000646print 'done.'